diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 64dad526c0b8ce..92700b16855c82 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,7 +10,7 @@
 obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o gds.o
 fuse-y += iomode.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0e457991953d3e..f0fded02026c64 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1832,6 +1832,56 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
 	return err;
 }
 
+/*
+ * Shared body of the GDS netdev register/unregister notifications.
+ *
+ * Copies the (not NUL-terminated) PCI device name out of the notify
+ * payload, terminates it and hands it to @handler.  Empty and over-long
+ * names are rejected with -EINVAL.
+ */
+static int fuse_notify_gds_netdev(struct fuse_conn *fc, unsigned int size,
+				  struct fuse_copy_state *cs,
+				  int (*handler)(struct fuse_conn *,
+						 const char *))
+{
+	char netdev_name[256];
+	int err;
+
+	err = -EINVAL;
+	if (size == 0 || size >= sizeof(netdev_name))
+		goto copy_finish;
+
+	err = fuse_copy_one(cs, netdev_name, size);
+	if (err)
+		goto copy_finish;
+
+	netdev_name[size] = '\0';
+	fuse_copy_finish(cs);
+
+	return handler(fc, netdev_name);
+
+copy_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_register_gds_netdev(struct fuse_conn *fc,
+					   unsigned int size,
+					   struct fuse_copy_state *cs)
+{
+	return fuse_notify_gds_netdev(fc, size, cs,
+				      fuse_dmabuf_register_netdev);
+}
+
+static int fuse_notify_unregister_gds_netdev(struct fuse_conn *fc,
+					     unsigned int size,
+					     struct fuse_copy_state *cs)
+{
+	return fuse_notify_gds_netdev(fc, size, cs,
+				      fuse_dmabuf_unregister_netdev);
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1857,6 +1907,12 @@ static int fuse_notify(struct fuse_conn *fc, enum
fuse_notify_code code, case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); + case FUSE_NOTIFY_REGISTER_GDS_NETDEV: + return fuse_notify_register_gds_netdev(fc, size, cs); + + case FUSE_NOTIFY_UNREGISTER_GDS_NETDEV: + return fuse_notify_unregister_gds_netdev(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ec91a33627d7f8..085eb73ffbddcc 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -18,9 +18,11 @@ module_param(enable_uring, bool, 0644); MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); -#define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_URING_IOV_SEGS 3 /* header, payload and mr */ #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +#define FUSE_RING_PAYLOAD_MR_PG 2 +#define FUSE_URING_IOV_SEGS_COMPAT 2 /* header and payload */ /* Threshold that determines if a better queue should be searched for */ #define FUSE_URING_Q_THRESHOLD 2 @@ -231,6 +233,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) io_pages_free(&ent->header_pages, ent->nr_header_pages); io_pages_free(&ent->payload_pages, ent->nr_payload_pages); + fuse_dmabuf_clear_sgt(&ent->dmabuf_ent); kfree(ent); } @@ -837,6 +840,24 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, return err ? 
-EFAULT : 0; } +static void fuse_uring_prepare_mr(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_mr_in *mr = &req->args->mr.mr_in; + + if (req->args->is_gds) { + BUG_ON(mr->type != FUSE_MR_DMABUF); + + /* Associate GPU scatter-gather table with DMA-buf file descriptor */ + fuse_dmabuf_set_sgt(&ent->dmabuf_ent, (struct fuse_refcnt_sgt *)mr->rdma_dmabuf.sgt); + + /* Get DMA-buf file descriptor for userspace */ + mr->rdma_dmabuf.dmabuf_fd = ent->dmabuf_ent.fd; + } else { + mr->type = FUSE_MR_NONE; + } +} + static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_req *req) { @@ -844,6 +865,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring *ring = queue->ring; int err; struct fuse_uring_req_header *headers = NULL; + struct fuse_mr *mr; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -856,6 +878,8 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; + fuse_uring_prepare_mr(ent, req); + /* copy fuse_in_header */ if (ent->header_pages) { headers = kmap_local_page( @@ -863,6 +887,10 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + /* copy MR info located after header in same page */ + mr = (struct fuse_mr *)(headers + 1); + memcpy(&mr->mr_in, &req->args->mr.mr_in, sizeof(struct fuse_mr_in)); + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); kunmap_local(headers); } else { @@ -874,6 +902,10 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, } err = copy_to_user(&ent->headers->in_out, &req->in.h, sizeof(req->in.h)); + if (!err) { + err = copy_to_user(ent->headers + 1, &req->args->mr.mr_in, + sizeof(struct fuse_mr_in)); + } if (err) err = -EFAULT; } @@ -1187,15 +1219,15 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, struct iov_iter iter; ssize_t ret; - if (sqe->len != FUSE_URING_IOV_SEGS) + if (sqe->len < 
FUSE_URING_IOV_SEGS_COMPAT || sqe->len > FUSE_URING_IOV_SEGS) return -EINVAL; /* * Direction for buffer access will actually be READ and WRITE, * using write for the import should include READ access as well. */ - ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, - FUSE_URING_IOV_SEGS, &iov, &iter); + ret = import_iovec(WRITE, uiov, sqe->len, + sqe->len, &iov, &iter); if (ret < 0) return ret; @@ -1296,14 +1328,32 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + /* Payload MR is optional - iov[2] indicates existence, actual location follows header */ + if (cmd->sqe->len > FUSE_RING_PAYLOAD_MR_PG) + ent->payload_mr = iov[FUSE_RING_PAYLOAD_MR_PG].iov_base; + else + ent->payload_mr = NULL; + err = fuse_uring_pin_pages(ent); if (err) { - kfree(ent); - return ERR_PTR(err); + goto out; + } + + err = fuse_create_dmabuf(&ent->dmabuf_ent, payload_size); + if (err) { + goto dmabuf_out; } atomic_inc(&ring->queue_refs); return ent; + +dmabuf_out: + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, ent->nr_payload_pages); + +out: + kfree(ent); + return ERR_PTR(err); } /* diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 948e9c01aeaef5..18be9ab7aec015 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -7,6 +7,7 @@ #ifndef _FS_FUSE_DEV_URING_I_H #define _FS_FUSE_DEV_URING_I_H +#include "gds.h" #include "fuse_i.h" #ifdef CONFIG_FUSE_IO_URING @@ -45,6 +46,10 @@ struct fuse_ring_ent { void __user *payload; struct page **payload_pages; int nr_payload_pages; + void __user *payload_mr; + + /* DMA-buf object info for GPU Direct Storage */ + struct fuse_dmabuf_entry dmabuf_ent; /* the ring queue that owns the request */ struct fuse_ring_queue *queue; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 55d018283f0201..6ca5f45fe3276d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -8,6 +8,7 @@ #include "fuse_i.h" #include "fuse_dlm_cache.h" 
+#include "gds.h" #include #include @@ -682,9 +683,19 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->in_numargs = 1; args->in_args[0].size = sizeof(ia->read.in); args->in_args[0].value = &ia->read.in; + + if (args->is_gds) { + /* file data is passed through RDMA, the read size is returned in out.args[0] */ + args->user_pages = false; + args->out_pages = false; + args->out_args[0].size = sizeof(ia->read.out); + args->out_args[0].value = &ia->read.out; + } + else { + args->out_args[0].size = count; + } args->out_argvar = true; args->out_numargs = 1; - args->out_args[0].size = count; } static void fuse_release_user_pages(struct fuse_args_pages *ap, @@ -846,6 +857,8 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; + struct fuse_args *args = &ia->ap.args; + int err; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { @@ -856,7 +869,14 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, if (ia->io->async) return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fm, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); + + /* Handle different return values of fuse_simple_request: regular read returns + * bytes read, GDS read returns sizeof(ia->read.out) - normalize to bytes read */ + if (args->is_gds && err == sizeof(ia->read.out)) { + err = ia->read.out.size; + } + return err; } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -1099,13 +1119,23 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, ia->write.in.size = count; args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; - args->in_numargs = 2; + args->in_numargs = 1; if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); 
args->in_args[0].value = &ia->write.in; - args->in_args[1].size = count; + + if (args->is_gds) { + /* skip data copy */ + args->user_pages = false; + args->in_pages = false; + } + else { + args->in_numargs++; + args->in_args[1].size = count; + } + args->out_numargs = 1; args->out_args[0].size = sizeof(ia->write.out); args->out_args[0].value = &ia->write.out; @@ -1596,8 +1626,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct fuse_io_args *ia; unsigned int max_pages; bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; + bool is_gds = false; max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); if (!ia) return -ENOMEM; @@ -1636,6 +1668,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (err && !nbytes) break; + if (fuse_is_gds_buffer(&ia->ap)) { + is_gds = true; + err = fuse_gds_map_sg(fc, write, ia); + if (err) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + break; + } + } + if (write) { if (!capable(CAP_FSETID)) ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; @@ -1645,6 +1686,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(ia, pos, nbytes, owner); } + if (is_gds) { + fuse_gds_unmap_sg(fc, write, ia); + } + if (!io->async || nres < 0) { fuse_release_user_pages(&ia->ap, io->should_dirty); fuse_io_free(ia); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4f0474e2e31def..de614f5c5d2740 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -32,6 +32,7 @@ #include #include #include "fuse_dlm_cache.h" +// #include "gds.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -315,8 +316,10 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool is_pinned:1; + bool is_gds:1; struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; + struct fuse_mr mr; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); }; @@ 
-866,6 +869,7 @@ struct fuse_conn { /* Is synchronous FUSE_INIT allowed? */ unsigned int sync_init:1; + unsigned int gds:1; /* Use io_uring for communication */ unsigned int io_uring; @@ -936,6 +940,9 @@ struct fuse_conn { /* The foffset alignment in PAGE */ unsigned int alignment_pages; + /* List of registered netdevs of GDS */ + spinlock_t gds_netdev_lock; + struct list_head gds_netdev_list; }; /* @@ -1106,6 +1113,7 @@ struct fuse_io_args { union { struct { struct fuse_read_in in; + struct fuse_gds_read_out out; u64 attr_ver; } read; struct { @@ -1200,7 +1208,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void __exit fuse_ctl_cleanup(void); +void fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing diff --git a/fs/fuse/gds.c b/fs/fuse/gds.c new file mode 100644 index 00000000000000..0fba6a17a0cee2 --- /dev/null +++ b/fs/fuse/gds.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2025 DataDirect Networks. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "gds.h" + +#define GDS_MOCK_TEST 0 + +/* NVIDIA GPU Direct Storage interface and operations */ + +static atomic_t nvfs_ops_refcnt = ATOMIC_INIT(0); +static struct nvfs_dma_rw_ops *nvfs_ops = NULL; + +static struct nvfs_dma_rw_ops* get_nvfs_dma_ops(void) +{ + struct nvfs_dma_rw_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(nvfs_ops); + if (ops) + atomic_inc(&nvfs_ops_refcnt); + rcu_read_unlock(); + return ops; +} + +static void put_nvfs_dma_ops(void) +{ + atomic_dec(&nvfs_ops_refcnt); +} + +int fuse_register_nvfs_dma_ops(struct nvfs_dma_rw_ops *ops) +{ + if (!ops) + return -EINVAL; + + rcu_assign_pointer(nvfs_ops, ops); + return 0; +} + +void fuse_unregister_nvfs_dma_ops(void) +{ + rcu_assign_pointer(nvfs_ops, NULL); + synchronize_rcu(); + while (atomic_read(&nvfs_ops_refcnt) > 0) + msleep(100); +} + +EXPORT_SYMBOL_GPL(fuse_register_nvfs_dma_ops); +EXPORT_SYMBOL_GPL(fuse_unregister_nvfs_dma_ops); + + +/* DMA-buf operations for GPU Direct Storage memory regions */ + +static int fuse_dmabuf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attach) +{ + struct fuse_dmabuf_entry *entry = (struct fuse_dmabuf_entry *)dmabuf->priv; + struct fuse_refcnt_sgt *sgt_ref; + + struct ib_umem_dmabuf *umem_dma_buf = (struct ib_umem_dmabuf *)attach->importer_priv; + + if (umem_dma_buf->umem.address != 0) { + return -EINVAL; + } + + if (umem_dma_buf->umem.length > dmabuf->size) { + return -EINVAL; + } + + sgt_ref = fuse_dmabuf_get_sgt(entry); + if (!sgt_ref) { + return -EINVAL; + } + + entry->length = umem_dma_buf->umem.length; + attach->priv = sgt_ref; + return 0; +} + +static void fuse_dmabuf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attach) +{ + struct fuse_refcnt_sgt *sgt_ref = (struct fuse_refcnt_sgt *)attach->priv; + kref_put(&sgt_ref->kref, fuse_dmabuf_release_sgt); +} + +static struct sg_table *fuse_dmabuf_map_dma(struct dma_buf_attachment *attach, + enum 
dma_data_direction direction) +{ + struct fuse_refcnt_sgt *sgt_ref = (struct fuse_refcnt_sgt *)attach->priv; + return &sgt_ref->sgt; +} + +static void fuse_dmabuf_unmap_dma(struct dma_buf_attachment *attach, + struct sg_table *sgt, + enum dma_data_direction direction) +{ +} + +static void fuse_dmabuf_release(struct dma_buf *dmabuf) +{ +} + +static const struct dma_buf_ops fuse_gds_dmabuf_ops = { + .attach = fuse_dmabuf_attach, + .detach = fuse_dmabuf_detach, + .map_dma_buf = fuse_dmabuf_map_dma, + .unmap_dma_buf = fuse_dmabuf_unmap_dma, + .release = fuse_dmabuf_release, +}; + + +int fuse_create_dmabuf(struct fuse_dmabuf_entry *ent, size_t size) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct dma_buf *dmabuf; + int dmabuf_fd; + + spin_lock_init(&ent->lock); + exp_info.ops = &fuse_gds_dmabuf_ops; + exp_info.size = size; + exp_info.flags = O_RDWR | O_CLOEXEC; + + exp_info.priv = ent; + exp_info.exp_name = "fuse_gds"; + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + return PTR_ERR(dmabuf); + } + ent->dmabuf = dmabuf; + + dmabuf_fd = dma_buf_fd(dmabuf, O_RDWR | O_CLOEXEC); + if (dmabuf_fd < 0) { + dma_buf_put(dmabuf); // double check + return dmabuf_fd; + } + ent->fd = dmabuf_fd; + return 0; +} + + +/* Reference-counted scatter-gather table management for DMA-buf entries */ + +void fuse_dmabuf_release_sgt(struct kref *kref) +{ + struct fuse_refcnt_sgt *sgt_ref = container_of(kref, struct fuse_refcnt_sgt, kref); + + sg_free_table(&sgt_ref->sgt); + kfree(sgt_ref); +} + +void fuse_dmabuf_set_sgt(struct fuse_dmabuf_entry *ent, struct fuse_refcnt_sgt *sgt_ref) +{ + struct fuse_refcnt_sgt *old_sgt_ref; + + spin_lock(&ent->lock); + old_sgt_ref = ent->sgt_ref; + ent->sgt_ref = sgt_ref; + spin_unlock(&ent->lock); + if (old_sgt_ref) + kref_put(&old_sgt_ref->kref, fuse_dmabuf_release_sgt); +} + +struct fuse_refcnt_sgt *fuse_dmabuf_get_sgt(struct fuse_dmabuf_entry *ent) +{ + struct fuse_refcnt_sgt *sgt_ref; + spin_lock(&ent->lock); + sgt_ref = ent->sgt_ref; + if 
(sgt_ref) + kref_get(&sgt_ref->kref); + spin_unlock(&ent->lock); + return sgt_ref; +} + +void fuse_dmabuf_clear_sgt(struct fuse_dmabuf_entry *ent) +{ + fuse_dmabuf_set_sgt(ent, NULL); +} + + +/* Network device registration management for GDS operations */ + +int fuse_dmabuf_register_netdev(struct fuse_conn *fc, const char *pci_dev_name) +{ + struct fuse_dmabuf_netdev *entry, *new_entry; + struct pci_dev *pci_dev; + int domain, bus, dev, fn; + int ret = 0; + + if (sscanf(pci_dev_name, "%x:%x:%x.%d", &domain, &bus, &dev, &fn) != 4) { + return -EINVAL; + } + + pci_dev = pci_get_domain_bus_and_slot(domain, bus, PCI_DEVFN(dev, fn)); + if (!pci_dev) { + return -ENODEV; + } + + new_entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto out; + } + new_entry->netdev = pci_dev; + + spin_lock(&fc->gds_netdev_lock); + list_for_each_entry_rcu(entry, &fc->gds_netdev_list, list) { + if (entry->netdev == pci_dev) { + ret = -EINVAL; + goto out_unlock; + } + } + list_add_rcu(&new_entry->list, &fc->gds_netdev_list); + spin_unlock(&fc->gds_netdev_lock); + return 0; + +out_unlock: + spin_unlock(&fc->gds_netdev_lock); + kfree(new_entry); +out: + pci_dev_put(pci_dev); + return ret; +} + +int fuse_dmabuf_unregister_netdev(struct fuse_conn *fc, const char *pci_dev_name) +{ + struct fuse_dmabuf_netdev *entry; + struct pci_dev *pci_dev; + int domain, bus, dev, fn; + bool found = false; + + if (sscanf(pci_dev_name, "%x:%x:%x.%d", &domain, &bus, &dev, &fn) != 4) { + return -EINVAL; + } + + pci_dev = pci_get_domain_bus_and_slot(domain, bus, PCI_DEVFN(dev, fn)); + if (!pci_dev) { + return -ENODEV; + } + + spin_lock(&fc->gds_netdev_lock); + list_for_each_entry_rcu(entry, &fc->gds_netdev_list, list) { + if (entry->netdev == pci_dev) { + found = true; + list_del_rcu(&entry->list); + pci_dev_put(entry->netdev); + } + } + spin_unlock(&fc->gds_netdev_lock); + pci_dev_put(pci_dev); + + if (!found) { + return -ENOENT; + } + kfree_rcu(entry, rcu); + return 0; +} + +void 
fuse_dmabuf_cleanup_netdev(struct fuse_conn *fc) +{ + struct fuse_dmabuf_netdev *entry; + + spin_lock(&fc->gds_netdev_lock); + list_for_each_entry_rcu(entry, &fc->gds_netdev_list, list) { + list_del_rcu(&entry->list); + pci_dev_put(entry->netdev); + kfree_rcu(entry, rcu); + } + spin_unlock(&fc->gds_netdev_lock); +} + + +/* GPU buffer detection and DMA scatter-gather mapping operations */ + +static bool nvfs_dma_ops_is_gds_page(struct page *page) +{ + struct nvfs_dma_rw_ops *ops = get_nvfs_dma_ops(); + if (ops) { + bool ret = ops->nvfs_is_gpu_page(page); + put_nvfs_dma_ops(); + return ret; + } + return false; +} + +bool fuse_is_gds_buffer(struct fuse_args_pages *ap) +{ + struct page **pages = ap->pages; + unsigned int num_pages = ap->num_pages; + +#if GDS_MOCK_TEST + return true; +#else + return ap->args.user_pages && num_pages > 1 && nvfs_dma_ops_is_gds_page(pages[0]); +#endif +} + + +static int nvfs_dma_ops_dma_map_sg(struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs) +{ + struct nvfs_dma_rw_ops *ops = get_nvfs_dma_ops(); + int err = -EIO; + if (ops) { + err = ops->nvfs_dma_map_sg_attrs(device, sglist, nents, dma_dir, attrs); + put_nvfs_dma_ops(); + } + return err ? -EIO : 0; +} + +static int nvfs_dma_ops_dma_unmap_sg(struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir) +{ + struct nvfs_dma_rw_ops *ops = get_nvfs_dma_ops(); + int err = -EIO; + if (ops) { + err = ops->nvfs_dma_unmap_sg(device, sglist, nents, dma_dir); + put_nvfs_dma_ops(); + } + return err ? 
-EIO : 0; +} + +static int nvfs_dma_ops_dma_map_sg_mock(struct device *dev, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir) +{ + const struct dma_map_ops *ops = dev->dma_ops; + dma_set_min_align_mask(dev, 0xfff); /* TEMP: for testing */ + + int new_nents = dma_map_sg(dev, sglist, nents, dma_dir); + if (new_nents == 0) { + return -ENOMEM; + } + return 0; +} + +static int nvfs_dma_ops_dma_unmap_sg_mock(struct device *dev, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir) +{ + dma_unmap_sg(dev, sglist, nents, dma_dir); + return 0; +} + +int fuse_gds_map_sg(struct fuse_conn *fc, int write, struct fuse_io_args *ia) +{ + struct fuse_refcnt_sgt *sgt_ref; + struct scatterlist *sg; + struct pci_dev *dev = NULL; + struct fuse_dmabuf_netdev *entry; + struct fuse_mr_in *mr_in = &ia->ap.args.mr.mr_in; + int err = 0; + unsigned int i; + + if (!ia->ap.num_pages) { + return -EINVAL; + } + + // TEMP: use only the first netdev + rcu_read_lock(); + list_for_each_entry_rcu(entry, &fc->gds_netdev_list, list) { + dev = pci_dev_get(entry->netdev); + break; + } + rcu_read_unlock(); + if (!dev) { + return -ENODEV; + } + + sgt_ref = kzalloc(sizeof(*sgt_ref), GFP_KERNEL); + if (!sgt_ref) { + err = -ENOMEM; + goto out; + } + kref_init(&sgt_ref->kref); + + if (sg_alloc_table(&sgt_ref->sgt, ia->ap.num_pages, GFP_KERNEL)) { + err = -ENOMEM; + goto out_sgt; + } + + sg = sgt_ref->sgt.sgl; + for(i = 0; i < ia->ap.num_pages; i++) { + sg_set_page(sg, ia->ap.pages[i], ia->ap.descs[i].length + ia->ap.descs[i].offset, 0); + sg = sg_next(sg); + } + +#if GDS_MOCK_TEST + err = nvfs_dma_ops_dma_map_sg_mock(dev->dev.parent, sgt_ref->sgt.sgl, ia->ap.num_pages, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +#else + err = nvfs_dma_ops_dma_map_sg(dev->dev.parent, sgt_ref->sgt.sgl, ia->ap.num_pages, + write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, 0); +#endif + + ia->ap.args.is_gds = 1; + mr_in->type = FUSE_MR_DMABUF; + mr_in->rdma_dmabuf.sgt = (uint64_t)sgt_ref; + mr_in->rdma_dmabuf.iova_offset = ia->ap.descs[0].offset; + if (write) + ia->write.in.write_flags |= FUSE_WRITE_GDS; + else + ia->read.in.read_flags |= FUSE_READ_GDS; /* not used for now */ + return err; + +out_sgt: + kfree(sgt_ref); +out: + pci_dev_put(dev); + return err; +} + +int fuse_gds_unmap_sg(struct fuse_conn *fc, int write, struct fuse_io_args *ia) +{ + struct sg_table *sgt = (struct sg_table *)ia->ap.args.mr.mr_in.rdma_dmabuf.sgt; + struct pci_dev *dev = NULL; + struct fuse_dmabuf_netdev *entry; + int err; + + /* TODO: handle netdev list change between map and unmap */ + rcu_read_lock(); + list_for_each_entry_rcu(entry, &fc->gds_netdev_list, list) { + dev = pci_dev_get(entry->netdev); + break; + } + rcu_read_unlock(); + if (!dev) + return -ENODEV; + +#if GDS_MOCK_TEST + err = nvfs_dma_ops_dma_unmap_sg_mock(dev->dev.parent, sgt->sgl, sgt->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +#else + err = nvfs_dma_ops_dma_unmap_sg(dev->dev.parent, sgt->sgl, sgt->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +#endif + + pci_dev_put(dev); + return err; +} + +MODULE_IMPORT_NS(DMA_BUF); diff --git a/fs/fuse/gds.h b/fs/fuse/gds.h new file mode 100644 index 00000000000000..80a3c15f5c5d65 --- /dev/null +++ b/fs/fuse/gds.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2025 DataDirect Networks. 
+ */ + +#ifndef _FS_FUSE_GDS_H +#define _FS_FUSE_GDS_H + +#include +#include +#include +#include +#include "fuse_i.h" + +struct request; +struct nvfs_rdma_info; + +struct fuse_dmabuf_netdev { + struct pci_dev *netdev; + struct list_head list; + struct rcu_head rcu; +}; + +struct fuse_refcnt_sgt { + struct sg_table sgt; + struct kref kref; +}; + +struct fuse_dmabuf_entry { + spinlock_t lock; + int fd; + int iova_offset; + size_t length; /* length of the current read/write operation (TODO) */ + struct dma_buf *dmabuf; + struct fuse_refcnt_sgt *sgt_ref; +}; + +typedef struct fuse_mr_dmabuf nvfs_rdma_dmabuf; +typedef struct fuse_mr_rdma_info nvfs_rdma_info; + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; /* feature bitmap */ + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + bool (*nvfs_is_gpu_page) (struct page *); + unsigned int (*nvfs_gpu_index) (struct page *page); + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int dev_index); + int (*nvfs_get_gpu_sglist_rdma_info) (struct scatterlist *sglist, + int nents, + struct nvfs_rdma_info *rdma_infop); +}; + +extern struct fuse_dmabuf fuse_dmabuf; + + +void fuse_dmabuf_set_sgt(struct fuse_dmabuf_entry *ent, struct fuse_refcnt_sgt *sgt_ref); +struct fuse_refcnt_sgt *fuse_dmabuf_get_sgt(struct fuse_dmabuf_entry *ent); +void fuse_dmabuf_clear_sgt(struct fuse_dmabuf_entry *ent); +void fuse_dmabuf_release_sgt(struct kref *kref); + +bool fuse_is_gds_buffer(struct fuse_args_pages *ap); +int fuse_gds_map_sg(struct fuse_conn *fc, int write, struct fuse_io_args *ia); +int fuse_gds_unmap_sg(struct fuse_conn *fc, int write, struct fuse_io_args *ia); + +int 
fuse_dmabuf_register_netdev(struct fuse_conn *fc, const char *pci_dev_name); +int fuse_dmabuf_unregister_netdev(struct fuse_conn *fc, const char *pci_dev_name); +void fuse_dmabuf_cleanup_netdev(struct fuse_conn *fc); +int fuse_create_dmabuf(struct fuse_dmabuf_entry *ent, size_t size); + +int fuse_register_nvfs_dma_ops(struct nvfs_dma_rw_ops *ops); +void fuse_unregister_nvfs_dma_ops(void); +#endif diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 61962fd615857a..2a33b56f6e0aa0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -10,6 +10,7 @@ #include "fuse_dlm_cache.h" #include "fuse_dev_i.h" #include "dev_uring_i.h" +#include "gds.h" #include #include @@ -1045,6 +1046,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); + + spin_lock_init(&fc->gds_netdev_lock); + INIT_LIST_HEAD(&fc->gds_netdev_list); fm->fc = fc; } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -1440,6 +1444,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->inval_inode_entries = 1; if (flags & FUSE_EXPIRE_INODE_ENTRY) fc->expire_inode_entries = 1; + if (flags & FUSE_GDS_SUPPORT) + fc->gds = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1553,6 +1559,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); + fuse_dmabuf_cleanup_netdev(fc); kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 52ac7a3d266d46..95aee3b72c9e80 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -261,7 +261,8 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 39 +// todo +#define FUSE_KERNEL_MINOR_VERSION 40 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -482,6 +483,7 @@ struct fuse_file_lock { #define FUSE_URING_REDUCED_Q (1ULL << 59) #define 
FUSE_INVAL_INODE_ENTRY (1ULL << 60) #define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) +#define FUSE_GDS_SUPPORT (1ULL << 62) /** * CUSE INIT request/reply flags @@ -512,18 +514,23 @@ struct fuse_file_lock { * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed * FUSE_WRITE_LOCKOWNER: lock_owner field is valid * FUSE_WRITE_KILL_SUIDGID: kill suid and sgid bits + * FUSE_WRITE_GDS: write operation using GPU Direct Storage */ #define FUSE_WRITE_CACHE (1 << 0) #define FUSE_WRITE_LOCKOWNER (1 << 1) #define FUSE_WRITE_KILL_SUIDGID (1 << 2) +#define FUSE_WRITE_GDS (1 << 20) /* Obsolete alias; this flag implies killing suid/sgid only. */ #define FUSE_WRITE_KILL_PRIV FUSE_WRITE_KILL_SUIDGID /** * Read flags + * + * FUSE_READ_GDS: read operation using GPU Direct Storage */ #define FUSE_READ_LOCKOWNER (1 << 1) +#define FUSE_READ_GDS (1 << 20) /** * Ioctl flags @@ -675,6 +682,8 @@ enum fuse_notify_code { FUSE_NOTIFY_STORE = 4, FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_REGISTER_GDS_NETDEV = 100, + FUSE_NOTIFY_UNREGISTER_GDS_NETDEV = 101, FUSE_NOTIFY_CODE_MAX, }; @@ -828,6 +837,10 @@ struct fuse_read_in { uint32_t padding; }; +struct fuse_gds_read_out { + uint64_t size; +}; + #define FUSE_COMPAT_WRITE_IN_SIZE 24 struct fuse_write_in { @@ -845,6 +858,43 @@ struct fuse_write_out { uint32_t padding; }; +enum fuse_mr_type { + FUSE_MR_NONE = 0, + FUSE_MR_DMABUF = 1, + FUSE_MR_RDMAINFO = 2, +}; + +struct fuse_mr_dmabuf +{ + uint32_t dmabuf_fd; + uint32_t iova_offset; + uint64_t sgt; +}; + +struct fuse_mr_rdma_info +{ + uint8_t version; /* to support future changes to structure */ + uint8_t flags; /* if bit 0 != 0, then gid field is valid */ + uint16_t lid; /* subnet local identifier of the client node port */ + uint32_t qp_num; /* QP number of DCT on the client node */ + uint64_t rem_vaddr; /* remote address */ + uint32_t size; + uint32_t rkey; + uint64_t gid[2]; /* 16-byte global identifier of the client node port */ + uint32_t dc_key; +}; + 
+/*
+ * Memory-region descriptor passed alongside a GDS read/write.  Only the
+ * embedded mr_in part is copied between kernel and server.
+ */
+struct fuse_mr {
+	struct fuse_mr_in {
+		/* holds an enum fuse_mr_type value; declared fixed-width
+		 * because uapi enum sizing is compiler-dependent */
+		uint32_t type;
+		/* explicit padding up to the 8-byte alignment of the union
+		 * (previously implicit) */
+		uint32_t padding;
+		union {
+			struct fuse_mr_dmabuf rdma_dmabuf;
+			struct fuse_mr_rdma_info rdma_info;
+		};
+	} mr_in;
+	/* opaque userspace cookie, never dereferenced by the kernel;
+	 * uint64_t rather than void * so 32-bit and 64-bit layouts match */
+	uint64_t user_mr;
+};
+
 #define FUSE_COMPAT_STATFS_SIZE 48
 
 struct fuse_statfs_out {