From 49cad57274551baabc7071c252af0b2dcf9638a0 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 2 Apr 2025 16:17:23 +0200 Subject: [PATCH 01/46] fuse: fine-grained request ftraces Rename trace_fuse_request_send to trace_fuse_request_enqueue Add trace_fuse_request_send Add trace_fuse_request_bg_enqueue Add trace_fuse_request_enqueue This helps to track entire request time and time in different queues. Signed-off-by: Bernd Schubert (imported from commit 4a7f14274fc223e50b36f428e1b6acd661b73f53) --- fs/fuse/dev.c | 6 ++++++ fs/fuse/dev_uring.c | 1 + fs/fuse/fuse_trace.h | 51 +++++++++++++++++++++++++++++++++----------- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0b0241f47170d4..c339ea2c87ade1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -418,6 +418,9 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + + /* enqueue, as it is send to "fiq->ops queue" */ + trace_fuse_request_enqueue(req); fiq->ops->send_req(fiq, req); } @@ -732,6 +735,8 @@ static int fuse_request_queue_background(struct fuse_req *req) } __set_bit(FR_ISREPLY, &req->flags); + trace_fuse_request_bg_enqueue(req); + #ifdef CONFIG_FUSE_IO_URING if (fuse_uring_ready(fc)) return fuse_request_queue_background_uring(fc, req); @@ -1467,6 +1472,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); + trace_fuse_request_send(req); args = req->args; reqsize = req->in.h.len; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3a38b61aac26f7..5a22328c078a73 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1207,6 +1207,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); + 
trace_fuse_request_send(ent->fuse_req); io_uring_cmd_done(cmd, ret, issue_flags); } diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index bbe9ddd8c71696..393c630e772635 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -77,30 +77,55 @@ OPCODES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -TRACE_EVENT(fuse_request_send, +#define FUSE_REQ_TRACE_FIELDS \ + __field(dev_t, connection) \ + __field(uint64_t, unique) \ + __field(enum fuse_opcode, opcode) \ + __field(uint32_t, len) \ + +#define FUSE_REQ_TRACE_ASSIGN(req) \ + do { \ + __entry->connection = req->fm->fc->dev; \ + __entry->unique = req->in.h.unique; \ + __entry->opcode = req->in.h.opcode; \ + __entry->len = req->in.h.len; \ + } while (0) + + +TRACE_EVENT(fuse_request_enqueue, TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_bg_enqueue, + TP_PROTO(const struct fuse_req *req), TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), - TP_STRUCT__entry( - __field(dev_t, connection) - __field(uint64_t, unique) - __field(enum fuse_opcode, opcode) - __field(uint32_t, len) - ), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); - TP_fast_assign( - __entry->connection = req->fm->fc->dev; - __entry->unique = req->in.h.unique; - __entry->opcode = req->in.h.opcode; - __entry->len = req->in.h.len; - ), +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), TP_printk("connection %u req %llu opcode %u (%s) 
len %u ", __entry->connection, __entry->unique, __entry->opcode, __print_symbolic(__entry->opcode, OPCODES), __entry->len) ); + TRACE_EVENT(fuse_request_end, TP_PROTO(const struct fuse_req *req), From c8f3d45312d137f2185c08ca87004ade98c38b7d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 8 Jan 2025 16:10:27 +0100 Subject: [PATCH 02/46] fuse: {uring} Pin the user buffer This is to allow copying into the buffer from the application without the need to copy in ring context (and with that, the need that the ring task is active in kernel space). Signed-off-by: Bernd Schubert (cherry picked from commit 43d1a63dec17d928609fb9725ac4ab9d6e09803f) (imported from commit ea01f94a55f91fa48cb3a0304b1e41a92707539a) --- fs/fuse/dev.c | 9 ++ fs/fuse/dev_uring.c | 209 +++++++++++++++++++++++++++++++++++++++--- fs/fuse/dev_uring_i.h | 4 + fs/fuse/fuse_dev_i.h | 2 + 4 files changed, 211 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c339ea2c87ade1..1f107b160778fb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -917,6 +917,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs++; } + } else if (cs->ring.pages) { + cs->pg = cs->ring.pages[cs->ring.page_idx++]; + /* + * non stricly needed, just to avoid a uring exception in + * fuse_copy_finish + */ + get_page(cs->pg); + cs->len = PAGE_SIZE; + cs->offset = 0; } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 5a22328c078a73..e24b87f1df745d 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -11,6 +11,7 @@ #include #include +#include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); @@ -18,6 +19,8 @@ MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_RING_HEADER_PG 0 +#define FUSE_RING_PAYLOAD_PG 1 bool fuse_uring_enabled(void) @@ 
-142,6 +145,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +/* + * Copy from memmap.c, should be exported + */ +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) { struct fuse_ring_ent *ent; @@ -208,6 +226,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, + ent->nr_payload_pages); kfree(ent); } @@ -598,12 +619,66 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, fuse_copy_init(&cs, false, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); fuse_copy_finish(&cs); return err; } +/* + * Copy data from the req to the ring buffer + * In order to be able to write into the ring buffer from the application, + * i.e. to avoid io_uring_cmd_complete_in_task(), the header needs to be + * pinned as well. + */ +static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent, + struct fuse_uring_req_header *headers) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + fuse_copy_init(&cs, 1, NULL); + cs.is_uring = 1; + cs.req = req; + cs.ring.pages = ent->payload_pages; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. 
+ */ + if (args->in_args[0].size > 0) { + memcpy(&headers->op_in, in_args->value, in_args->size); + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + return err; +} + /* * Copy data from the req to the ring buffer */ @@ -630,6 +705,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_init(&cs, true, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; if (num_args > 0) { /* @@ -670,6 +747,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; + struct fuse_uring_req_header *headers = NULL; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -682,22 +760,29 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; - /* copy the request */ - err = fuse_uring_args_to_ring(ring, req, ent); - if (unlikely(err)) { - pr_info_ratelimited("Copy to ring failed: %d\n", err); - return err; - } - /* copy fuse_in_header */ - err = copy_to_user(&ent->headers->in_out, &req->in.h, - sizeof(req->in.h)); - if (err) { - err = -EFAULT; - return err; + if (ent->header_pages) { + headers = kmap_local_page( + ent->header_pages[FUSE_RING_HEADER_PG]); + + memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); + kunmap_local(headers); + } else { + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + err = copy_to_user(&ent->headers->in_out, 
&req->in.h, + sizeof(req->in.h)); + if (err) + err = -EFAULT; } - return 0; + return err; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, @@ -1006,6 +1091,45 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, } } +/* + * Copy from memmap.c, should be exported there + */ +static struct page **io_pin_pages(unsigned long uaddr, unsigned long len, + int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + /* * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] * the payload @@ -1032,6 +1156,59 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, return 0; } +static int fuse_uring_pin_pages(struct fuse_ring_ent *ent) +{ + struct fuse_ring *ring = ent->queue->ring; + int err; + + /* + * This needs to do locked memory accounting, for now privileged servers + * only. 
+ */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* Pin header pages */ + if (!PAGE_ALIGNED(ent->headers)) { + pr_info_ratelimited("ent->headers is not page-aligned: %p\n", + ent->headers); + return -EINVAL; + } + + ent->header_pages = io_pin_pages((unsigned long)ent->headers, + sizeof(struct fuse_uring_req_header), + &ent->nr_header_pages); + if (IS_ERR(ent->header_pages)) { + err = PTR_ERR(ent->header_pages); + pr_info_ratelimited("Failed to pin header pages, err=%d\n", + err); + ent->header_pages = NULL; + return err; + } + + if (ent->nr_header_pages != 1) { + pr_info_ratelimited("Header pages not pinned as one page\n"); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->header_pages = NULL; + return -EINVAL; + } + + /* Pin payload pages */ + ent->payload_pages = io_pin_pages((unsigned long)ent->payload, + ring->max_payload_sz, + &ent->nr_payload_pages); + if (IS_ERR(ent->payload_pages)) { + err = PTR_ERR(ent->payload_pages); + pr_info_ratelimited("Failed to pin payload pages, err=%d\n", + err); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->payload_pages = NULL; + return err; + } + + return 0; +} + static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) @@ -1073,6 +1250,12 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + err = fuse_uring_pin_pages(ent); + if (err) { + kfree(ent); + return ERR_PTR(err); + } + atomic_inc(&ring->queue_refs); return ent; } diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 51a563922ce141..c89c7dc27c76c1 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -40,7 +40,11 @@ enum fuse_ring_req_state { struct fuse_ring_ent { /* userspace buffer */ struct fuse_uring_req_header __user *headers; + struct page **header_pages; + int nr_header_pages; void __user *payload; + struct page **payload_pages; + int nr_payload_pages; /* the ring queue that 
owns the request */ struct fuse_ring_queue *queue; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 134bf44aff0d39..4037fd7bdeee66 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -36,6 +36,8 @@ struct fuse_copy_state { bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ + struct page **pages; + int page_idx; } ring; }; From 8a20f0b2b11d0a5d4e338ed22212bc1a3cdb2290 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 17 Jan 2025 22:06:30 +0100 Subject: [PATCH 03/46] fuse: {io-uring] Avoid complete-in-task if pinned pages are used If pinned pages are used the application can write into these pages and io_uring_cmd_complete_in_task() is not needed. Signed-off-by: Bernd Schubert (imported from commit 5f0264c1dc0100e274f3db37511bba0d8043de1c) --- fs/fuse/dev_uring.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index e24b87f1df745d..d5737245516b01 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1438,12 +1438,31 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) return queue; } -static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) { struct io_uring_cmd *cmd = ent->cmd; - uring_cmd_set_ring_ent(cmd, ent); - io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + /* + * Task needed when pages are not pinned as the application doing IO + * is not allowed to write into fuse-server pages. + * Additionally for IO through io-uring as issue flags are unknown then. + * backgrounds requests might hold spin-locks, that conflict with + * io_uring_cmd_done() mutex lock. 
+ */ + if (!ent->header_pages || current->io_uring || bg) { + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + } else { + int err = fuse_uring_prepare_send(ent, ent->fuse_req); + struct fuse_ring_queue *queue = ent->queue; + + if (err) { + fuse_uring_next_fuse_req(ent, queue, + IO_URING_F_UNLOCKED); + return; + } + fuse_uring_send(ent, cmd, 0, IO_URING_F_UNLOCKED); + } } /* queue a fuse request and send it if a ring entry is available */ @@ -1478,7 +1497,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) spin_unlock(&queue->lock); if (ent) - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, false); return; @@ -1531,7 +1550,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, true); } else { spin_unlock(&queue->lock); } From 1cb8f2d520f6aacc6580fef766a396bab6802307 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 7 May 2025 23:39:00 +0200 Subject: [PATCH 04/46] fuse: Use fuse-server provided read-ahead for CAP_SYS_ADMIN Readahead is currently limited to bdi->ra_pages. One can change that after the mount with something like minor=$(stat -c "%d" /path/to/fuse) echo 1024 > /sys/class/bdi/0:${minor}/read_ahead_kb Issue is that fuse-server cannot do that from its ->init method, as it has to know about device minor, which blocks before init is complete. Fuse already sets the bdi value, but upper limit is the current bdi value. For CAP_SYS_ADMIN we can allow higher values. 
Signed-off-by: Bernd Schubert (imported from commit 763c96da4bd6d1bb95d8e6bb7fd352389f3a17b9) --- fs/fuse/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c795abe47a4f4a..093310f669b522 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1464,7 +1464,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, init_server_timeout(fc, timeout); - fm->sb->s_bdi->ra_pages = + if (capable(CAP_SYS_ADMIN)) + fm->sb->s_bdi->ra_pages = ra_pages; + else + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; From 7c5216b61b38d42167c559a044ffc13733a39c2c Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 8 Apr 2025 16:44:55 +0200 Subject: [PATCH 05/46] fuse: Increase the default max pages limit to 8182 Due to user buffer misalignment we actually need one page more, i.e. 1025 instead of 1024, will be handled differently. For now we just bump up the max. (imported from commit 3f71501c9c4702ba976145ff15c4a053ecd1a3ee) --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 093310f669b522..e0f3f1ab08a1f4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -39,7 +39,7 @@ DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned int fuse_max_pages_limit = 256; +unsigned int fuse_max_pages_limit = 4097; /* default is no timeout */ unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; From 9e44162e70c21af8866814cc12f4db3d3a478f44 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 20 Jun 2025 17:34:53 +0200 Subject: [PATCH 06/46] fuse: add DLM_LOCK opcode When having writeback cache enabled it is beneficial for data consistency to communicate to the FUSE server when the kernel prepares a page for caching. 
This lets the FUSE server react and lock the page. Additionally the kernel lets the FUSE server decide how much data it locks by the same call and keeps the given information in the dlm lock management. If the feature is not supported it will be disabled after first unsuccessful use. - Add DLM_LOCK fuse opcode - Add cache page lock caching for writeback cache functionality. This means sending out a FUSE call whenever the kernel prepares a page for writeback cache. The kernel will manage the cache so that it will keep track of already acquired locks. (except for the case that is documented in the code) - Use rb-trees for the management of the already 'locked' page ranges - Use rw_semaphore for synchronization in fuse_dlm_cache (imported from commit 287c8840b60d5cdcf806b16e8cc5722f2dbf0738) --- fs/fuse/Makefile | 2 +- fs/fuse/dir.c | 6 + fs/fuse/file.c | 13 + fs/fuse/fuse_dlm_cache.c | 551 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_dlm_cache.h | 50 ++++ fs/fuse/fuse_i.h | 18 ++ fs/fuse/fuse_trace.h | 1 + fs/fuse/inode.c | 11 + include/uapi/linux/fuse.h | 36 +++ 9 files changed, 687 insertions(+), 1 deletion(-) create mode 100644 fs/fuse/fuse_dlm_cache.c create mode 100644 fs/fuse/fuse_dlm_cache.h diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 22ad9538dfc4b8..64bc8682ae9659 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := trace.o # put trace.o first so we see ftrace errors sooner -fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7ac6b232ef1232..c1179ce8fc96b2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -6,6 +6,7 @@ See the file COPYING. 
*/ +#include "fuse_dlm_cache.h" #include "fuse_i.h" #include @@ -2181,6 +2182,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * truncation has already been done by OPEN. But still * need to truncate page cache. */ + if (fc->dlm && fc->writeback_cache) + fuse_dlm_cache_release_locks(fi); i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; @@ -2286,6 +2289,9 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (fc->dlm && fc->writeback_cache) + fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 676fd9856bfbf3..63b45e74356743 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -1489,6 +1490,17 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!fc->handle_killpriv_v2 || !setattr_should_drop_suidgid(idmap, file_inode(file))) writeback = true; + /* if we have dlm support acquire the lock for the area + * we are writing into */ + if (fc->dlm) { + /* note that a file opened with O_APPEND will have relative values + * in ki_pos. This code is here for convenience and for libfuse overlay test. + * Filesystems should handle O_APPEND with 'direct io' to additionally + * get the performance benefits of 'parallel direct writes'. */ + loff_t pos = file->f_flags & O_APPEND ? 
i_size_read(inode) + iocb->ki_pos : iocb->ki_pos; + size_t length = iov_iter_count(from); + fuse_get_dlm_write_lock(file, pos, length); + } } inode_lock(inode); @@ -3206,6 +3218,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); + fuse_dlm_cache_init(fi); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c new file mode 100644 index 00000000000000..ea947f34a9f70a --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FUSE page lock cache implementation + */ +#include "fuse_i.h" +#include "fuse_dlm_cache.h" + +#include +#include +#include +#include + + +/* A range of pages with a lock */ +struct fuse_dlm_range { + /* Interval tree node */ + struct rb_node rb; + /* Start page offset (inclusive) */ + pgoff_t start; + /* End page offset (inclusive) */ + pgoff_t end; + /* Subtree end value for interval tree */ + pgoff_t __subtree_end; + /* Lock mode */ + enum fuse_page_lock_mode mode; + /* Temporary list entry for operations */ + struct list_head list; +}; + +/* Lock modes for FUSE page cache */ +#define FUSE_PCACHE_LK_READ 1 /* Shared read lock */ +#define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ + +/* Interval tree definitions for page ranges */ +static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +{ + return range->start; +} + +static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +{ + return range->end; +} + +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); + +/** + * fuse_page_cache_init - Initialize a page cache lock manager + * @cache: The cache to initialize + * + * Initialize a page cache lock manager for a FUSE inode. 
+ * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_cache_init(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + + if (!cache) + return -EINVAL; + + init_rwsem(&cache->lock); + cache->ranges = RB_ROOT_CACHED; + + return 0; +} + +/** + * fuse_page_cache_destroy - Clean up a page cache lock manager + * @cache: The cache to clean up + * + * Release all locks and free all resources associated with the cache. + */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + struct rb_node *node; + + if (!cache) + return; + + /* Release all locks */ + down_write(&cache->lock); + while ((node = rb_first_cached(&cache->ranges)) != NULL) { + range = rb_entry(node, struct fuse_dlm_range, rb); + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + up_write(&cache->lock); +} + +/** + * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end] + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Return: Pointer to the first overlapping range, or NULL if none found + */ +static struct fuse_dlm_range * +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + return fuse_page_it_iter_first(&cache->ranges, start, end); +} + +/** + * fuse_page_try_merge - Try to merge ranges within a specific region + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Attempt to merge ranges within and adjacent to the specified region + * that have the same lock mode. 
+ */ +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *next; + struct rb_node *node; + + if (!cache) + return; + + /* Find the first range that might need merging */ + range = NULL; + node = rb_first_cached(&cache->ranges); + while (node) { + range = rb_entry(node, struct fuse_dlm_range, rb); + if (start == 0 || range->end >= start - 1) + break; + node = rb_next(node); + } + + if (!range || range->start > end + 1) + return; + + /* Try to merge ranges in and around the specified region */ + while (range && range->start <= end + 1) { + /* Get next range before we potentially modify the tree */ + next = NULL; + if (rb_next(&range->rb)) { + next = rb_entry(rb_next(&range->rb), + struct fuse_dlm_range, rb); + } + + /* Try to merge with next range if adjacent and same mode */ + if (next && range->mode == next->mode && + range->end + 1 == next->start) { + /* Unlink both: interval keys must not change in-tree */ + fuse_page_it_remove(next, &cache->ranges); + fuse_page_it_remove(range, &cache->ranges); + range->end = next->end; + kfree(next); + fuse_page_it_insert(range, &cache->ranges); + + /* Continue with the same range */ + continue; + } + + /* Move to next range */ + range = next; + } +} + +/** + * fuse_dlm_lock_range - Lock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode (read or write) + * + * Add a locked range on the specified range of pages. + * If parts of the range are already locked, only add the remaining parts. 
+ * For overlapping ranges, handle lock compatibility: + * - READ locks are compatible with existing READ locks + * - READ locks are compatible with existing WRITE locks (downgrade not needed) + * - WRITE locks need to upgrade existing READ locks + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *new_range, *next; + int lock_mode; + int ret = 0; + LIST_HEAD(to_lock); + LIST_HEAD(to_upgrade); + pgoff_t current_start = start; + + if (!cache || start > end) + return -EINVAL; + + /* Convert to lock mode */ + lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. 
*/ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but 
failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. + * + * Return: 0 on success, negative error code on failure + */ +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, + pgoff_t end) +{ + struct fuse_dlm_range *range, *new_range; + int ret = 0; + + if (!cache || start > end) + return -EINVAL; + + /* Find a range that contains [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + if (!range) { + ret = -EINVAL; + goto out; + } + + /* If the hole is at the beginning of the range */ + if (start == range->start) { + range->start = end + 1; + goto out; + } + + /* If the hole is at the end of the range */ + if (end == range->end) { + range->end = start - 1; + goto out; + } + + /* The hole is in the middle, need to split */ + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out; + } + + /* Copy properties from original range */ + *new_range = *range; + INIT_LIST_HEAD(&new_range->list); + + /* Adjust ranges */ + new_range->start = end + 1; + range->end = start - 1; + + /* Update interval tree */ + fuse_page_it_remove(range, &cache->ranges); + fuse_page_it_insert(range, &cache->ranges); + fuse_page_it_insert(new_range, &cache->ranges); + +out: + return ret; +} + +/** + * fuse_dlm_unlock_range - Unlock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Release locks on the specified range of pages. 
+ * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, + pgoff_t start, pgoff_t end) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *next; + int ret = 0; + + if (!cache) + return -EINVAL; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check if we need to punch a hole */ + if (start > range->start && end < range->end) { + /* Punch a hole in the middle */ + ret = fuse_dlm_punch_hole(cache, start, end); + if (ret) + goto out; + /* After punching a hole, we're done */ + break; + } else if (start > range->start) { + /* Adjust the end of the range */ + range->end = start - 1; + } else if (end < range->end) { + /* Adjust the start of the range */ + range->start = end + 1; + } else { + /* Complete overlap, remove the range */ + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + + range = next; + } + +out: + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_range_is_locked - Check if a page range is already locked + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode to check for (or NULL to check for any lock) + * + * Check if the specified range of pages is already locked. + * The entire range must be locked for this to return true. 
+ * + * Return: true if the entire range is locked, false otherwise + */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + int lock_mode = 0; + pgoff_t current_start = start; + + if (!cache || start > end) + return false; + + /* Convert to lock mode if specified */ + if (mode == FUSE_PAGE_LOCK_READ) + lock_mode = FUSE_PCACHE_LK_READ; + else if (mode == FUSE_PAGE_LOCK_WRITE) + lock_mode = FUSE_PCACHE_LK_WRITE; + + down_read(&cache->lock); + + /* Find the first range that overlaps with [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + + /* Check if the entire range is covered */ + while (range && current_start <= end) { + /* If we're checking for a specific mode, verify it matches */ + if (lock_mode && range->mode != lock_mode) { + /* Wrong lock mode */ + up_read(&cache->lock); + return false; + } + + /* Check if there's a gap before this range */ + if (current_start < range->start) { + /* Found a gap */ + up_read(&cache->lock); + return false; + } + + /* Move current_start past this range */ + current_start = range->end + 1; + + /* Get next overlapping range */ + range = fuse_page_it_iter_next(range, start, end); + } + + /* Check if we covered the entire range */ + if (current_start <= end) { + /* There's a gap at the end */ + up_read(&cache->lock); + return false; + } + + up_read(&cache->lock); + return true; +} + +/** + * request a dlm lock from the fuse server + */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + + /* note that the offset and length don't have to be page 
aligned here + * but since we only get here on writeback caching we will send out + * page aligned requests */ + offset &= PAGE_MASK; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + /* note that this can be run from different processes + * at the same time. It is intentionally not protected + * since a DLM implementation in the FUSE server should take care + * of any races in lock requests */ + if (fuse_dlm_range_is_locked(fi, offset, + end, FUSE_PAGE_LOCK_WRITE)) + return; /* we already have this area locked */ + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = end - offset + 1; + inarg.type = FUSE_DLM_LOCK_WRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + /* fuse server does not support dlm, save the info */ + fc->dlm = 0; + return; + } + + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } + + if (err) + return; + else + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, + offset + outarg.locksize - 1, + FUSE_PAGE_LOCK_WRITE); +} diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h new file mode 100644 index 00000000000000..98b27a2c15d8ba --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * FUSE page cache lock implementation + */ + +#ifndef _FS_FUSE_DLM_CACHE_H +#define _FS_FUSE_DLM_CACHE_H + +#include +#include +#include +#include + + +struct fuse_inode; + +/* Lock 
modes for page ranges */ +enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE }; + +/* Page cache lock manager */ +struct fuse_dlm_cache { + /* Lock protecting the tree */ + struct rw_semaphore lock; + /* Interval tree of locked ranges */ + struct rb_root_cached ranges; +}; + +/* Initialize a page cache lock manager */ +int fuse_dlm_cache_init(struct fuse_inode *inode); + +/* Clean up a page cache lock manager */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode); + +/* Lock a range of pages */ +int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* Unlock a range of pages */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, + pgoff_t end); + +/* Check if a page range is already locked */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, + pgoff_t end, enum fuse_page_lock_mode mode); + +/* this is the interface to the filesystem */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length); + +#endif /* _FS_FUSE_DLM_CACHE_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7f16049387d15e..eacd1e735dc5b6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -31,6 +31,7 @@ #include #include #include +#include "fuse_dlm_cache.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -113,6 +114,17 @@ struct fuse_backing { struct rcu_head rcu; }; +/** + * data structure to save the information that we have + * requested dlm locks for the given area from the fuse server +*/ +struct dlm_locked_area +{ + struct list_head list; + loff_t offset; + size_t size; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -168,6 +180,9 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; + + /* dlm locked areas we have sent lock requests for */ + struct fuse_dlm_cache dlm_locked_areas; }; /* readdir cache 
(directory only) */ @@ -909,6 +924,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* do we have support for dlm in the fs? */ + unsigned int dlm:1; + /** Passthrough support for read/write IO */ unsigned int passthrough:1; diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 393c630e772635..9976e31a51a9c9 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,6 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e0f3f1ab08a1f4..f3ff39627a02bd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include "fuse_dev_i.h" #include "dev_uring_i.h" @@ -195,6 +196,7 @@ static void fuse_evict_inode(struct inode *inode) WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); + fuse_dlm_cache_release_locks(fi); } } @@ -578,6 +580,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; + + if (fc->dlm && fc->writeback_cache) + /* invalidate the range from the beginning of the first page + * in the given range to the last byte of the last page */ + fuse_dlm_unlock_range(fi, + pg_start << PAGE_SHIFT, + (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -991,6 +1001,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->blocked = 0; fc->initialized = 0; fc->connected = 1; + fc->dlm = 1; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c13e1f9a2f12bd..d4139185c7491c 100644 --- 
a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -663,6 +663,7 @@ enum fuse_opcode { FUSE_TMPFILE = 51, FUSE_STATX = 52, FUSE_COPY_FILE_RANGE_64 = 53, + FUSE_DLM_WB_LOCK = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1245,6 +1246,41 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Type of the dlm lock requested + */ +enum fuse_dlm_lock_type { + FUSE_DLM_LOCK_NONE = 0, + FUSE_DLM_LOCK_READ = 1, + FUSE_DLM_LOCK_WRITE = 2 +}; + +/** + * struct fuse_dlm_lock_in - Lock request + * @fh: file handle + * @offset: offset into the file + * @size: size of the locked region + * @type: type of lock + */ +struct fuse_dlm_lock_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t type; + uint64_t reserved; +}; + +/** + * struct fuse_dlm_lock_out - Lock response + * @locksize: how many bytes where locked by the call + * (most of the time we want to lock more than is requested + * to reduce number of calls) + */ +struct fuse_dlm_lock_out { + uint32_t locksize; + uint32_t padding; +}; + /** * Size of the ring buffer header */ From ec05a1c73fa95bb0dd99c87bc6f3344600afe01c Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Thu, 17 Jul 2025 17:04:16 +0000 Subject: [PATCH 07/46] fuse: Renumber FUSE_DLM_WB_LOCK to 100 Renumber the operation code to a high value to avoid conflicts with upstream. 
(imported from commit 27a0e9ea714f7fcf3ee40f977be6a17c10766509) --- fs/fuse/fuse_trace.h | 2 +- include/uapi/linux/fuse.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 9976e31a51a9c9..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,7 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ - EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d4139185c7491c..6828ceb1216d3c 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -662,8 +662,10 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, - FUSE_COPY_FILE_RANGE_64 = 53, - FUSE_DLM_WB_LOCK = 53, + FUSE_COPY_FILE_RANGE_64 = 53, + + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1252,7 +1254,7 @@ struct fuse_supp_groups { enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, - FUSE_DLM_LOCK_WRITE = 2 + FUSE_DLM_LOCK_WRITE = 2, }; /** From d25d13aaa327b7de796b68cef3c9e5b4f86877e2 Mon Sep 17 00:00:00 2001 From: Yong Ze Chen Date: Tue, 8 Jul 2025 06:41:45 +0000 Subject: [PATCH 08/46] fuse: invalidate inode aliases when doing inode invalidation Add support to invalidate inode aliases when doing inode invalidation. This is useful for distributed file systems, which use DLM for cache coherency. So, when a client loses its inode lock, it should invalidate its inode cache and dentry cache since the other client may delete this file after getting the inode lock.
Signed-off-by: Yong Ze Chen (imported from commit 49720b5c84ada61feeb09da9ad4b9a0a40694792) --- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/inode.c | 49 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fuse.h | 4 ++++ 3 files changed, 59 insertions(+) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eacd1e735dc5b6..07c9704ac58672 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -787,6 +787,12 @@ struct fuse_conn { */ unsigned handle_killpriv_v2:1; + /* invalidate inode entries when doing inode invalidation */ + unsigned inval_inode_entries:1; + + /* expire inode entries when doing inode invalidation */ + unsigned expire_inode_entries:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f3ff39627a02bd..81c7bfd2184d34 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -555,6 +555,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, return NULL; } +static void fuse_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); +} + +static void fuse_invalidate_inode_entry(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + /* For directories, use d_invalidate to handle children and submounts */ + dentry = d_find_alias(inode); + if (dentry) { + d_invalidate(dentry); + fuse_invalidate_entry_cache(dentry); + dput(dentry); + } + } else { + /* For regular files, just unhash the dentry */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + } +} + int 
fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { @@ -572,6 +611,11 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); + if (fc->inval_inode_entries) + fuse_invalidate_inode_entry(inode); + else if (fc->expire_inode_entries) + fuse_prune_aliases(inode); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { @@ -1467,6 +1511,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if (flags & FUSE_REQUEST_TIMEOUT) timeout = arg->request_timeout; + if (flags & FUSE_INVAL_INODE_ENTRY) + fc->inval_inode_entries = 1; + if (flags & FUSE_EXPIRE_INODE_ENTRY) + fc->expire_inode_entries = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1519,6 +1567,7 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY | FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6828ceb1216d3c..a9cdacfb76da48 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -448,6 +448,8 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
* init_out.request_timeout contains the timeout (in secs) + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -495,6 +497,8 @@ struct fuse_file_lock { #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) /** * CUSE INIT request/reply flags From 440ccd8458a1a546ecee398db04ca4be1b21db6b Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:18:06 +0000 Subject: [PATCH 09/46] fuse: Send DLM_WB_LOCK request in page_mkwrite handler Send a DLM_WB_LOCK request in the page_mkwrite handler to enable FUSE filesystems to acquire a distributed lock manager (DLM) lock for protecting upcoming dirty pages when a previously read-only mapped page is about to be written. Signed-off-by: Cheng Ding (imported from commit ec36c455214837e9ce0d3f3385a0bb50dcfb51db) --- fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 1 + 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 63b45e74356743..75e9116194aaeb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2339,6 +2339,57 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. 
+ */ +static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = ff->fm; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK))) + return -EIO; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.offset = offset; + inarg.size = length; + inarg.type = FUSE_DLM_PAGE_MKWRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->dlm = 0; + err = 0; + } + + if (!err && outarg.locksize < length) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", + length, outarg.locksize); + fuse_abort_conn(fc); + err = -EINVAL; + } + return err; +} /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly @@ -2357,7 +2408,18 @@ static void fuse_vma_close(struct vm_area_struct *vma) static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); - struct inode *inode = file_inode(vmf->vma->vm_file); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + + if (fm->fc->dlm) { + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; /* widen before shifting */ + size_t length = PAGE_SIZE; + int err = fuse_get_page_mkwrite_lock(file, pos, length); + if (err < 0) { + return vmf_error(err); + } + } file_update_time(vmf->vma->vm_file); folio_lock(folio); diff --git
a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index a9cdacfb76da48..e3acfb4aa34269 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1259,6 +1259,7 @@ enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, FUSE_DLM_LOCK_WRITE = 2, + FUSE_DLM_PAGE_MKWRITE = 3, }; /** From dbfa78c1ff1e0b88be901fbad32e6e09f46655fe Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:20:08 +0000 Subject: [PATCH 10/46] fuse: Allow read_folio to retry page fault and read operations Allow read_folio to return EAGAIN error and translate it to AOP_TRUNCATED_PAGE to retry page fault and read operations. This is used to prevent deadlock of folio lock/DLM lock order reversal: - Fault or read operations acquire folio lock first, then DLM lock. - FUSE daemon blocks new DLM lock acquisition while it is invalidating page cache. invalidate_inode_pages2_range() acquires folio lock. To prevent deadlock, the FUSE daemon will fail its DLM lock acquisition with EAGAIN if it detects an in-flight page cache invalidating operation. Signed-off-by: Cheng Ding (imported from commit 8ecf1182053891c6458b10be1272d2d562492fbd) --- fs/fuse/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 75e9116194aaeb..c029641d34f5d3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -836,8 +836,11 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ From a6dc0c66a609ce3eab2ba8ff380cc64bcd79c788 Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Thu, 17 Jul 2025 16:26:51 -0700 Subject: [PATCH 11/46] fuse: flush pending fuse events before aborting the connection generic/488 fails with fuse2fs in the following fashion: generic/488 _check_generic_filesystem: filesystem on /dev/sdf is inconsistent (see /var/tmp/fstests/generic/488.full for details) This test opens a large number of files, unlinks them (which really just renames them to fuse hidden files), closes the program, unmounts the filesystem, and runs fsck to check that there aren't any inconsistencies in the filesystem. Unfortunately, the 488.full file shows that there are a lot of hidden files left over in the filesystem, with incorrect link counts. Tracing fuse_request_* shows that there are a large number of FUSE_RELEASE commands that are queued up on behalf of the unlinked files at the time that fuse_conn_destroy calls fuse_abort_conn. Had the connection not aborted, the fuse server would have responded to the RELEASE commands by removing the hidden files; instead they stick around. Create a function to push all the background requests to the queue and then wait for the number of pending events to hit zero, and call this before fuse_abort_conn. That way, all the pending events are processed by the fuse server and we don't end up with a corrupt filesystem. Signed-off-by: Darrick J. Wong (imported from commit d4262f9cf5232394d518207863d1ad79f52b179e) --- fs/fuse/dev.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1f107b160778fb..c076904e8e4b95 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include "fuse_trace.h" @@ -2445,6 +2445,43 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. 
+ */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. + */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07c9704ac58672..dbefbcf3c14d5f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1319,6 +1319,12 @@ void fuse_dentry_tree_cleanup(void); void fuse_epoch_work(struct work_struct *work); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. 
+ */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 81c7bfd2184d34..e2563152bd6727 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2150,6 +2150,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); From 1f531ed564567e342cefdc5e39fa418f57a7932d Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 17:24:42 +0200 Subject: [PATCH 12/46] fuse: Refactor io-uring bg queue flush and queue abort This is a preparation to allow fuse-io-uring bg queue flush from flush_bg_queue() This does two function renames: fuse_uring_flush_bg -> fuse_uring_flush_queue_bg fuse_uring_abort_end_requests -> fuse_uring_flush_bg And fuse_uring_abort_end_queue_requests() is moved to fuse_uring_stop_queues(). Signed-off-by: Bernd Schubert (imported from commit e70ef24251116bc7f591a9a856c371549cd5ae77) --- fs/fuse/dev_uring.c | 14 +++++++------- fs/fuse/dev_uring_i.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index d5737245516b01..04888f8b263592 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -51,7 +51,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -93,7 +93,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } @@ -122,11 +122,11 @@ static void fuse_uring_abort_end_queue_requests(struct 
fuse_ring_queue *queue) fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -138,10 +138,9 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); } } @@ -498,6 +497,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); fuse_uring_teardown_entries(queue); } @@ -1536,7 +1536,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); /* diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index c89c7dc27c76c1..ea86d4084e7676 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -142,7 +142,7 @@ struct fuse_ring { bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); -void fuse_uring_abort_end_requests(struct fuse_ring *ring); +void fuse_uring_flush_bg(struct fuse_conn *fc); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); @@ -157,7 +157,7 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) return; if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_abort_end_requests(ring); + fuse_uring_flush_bg(fc); 
fuse_uring_stop_queues(ring); } } From 07447370363cda5d88f7b50beb6fc801b47cf93f Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 18:24:41 +0200 Subject: [PATCH 13/46] fuse: Flush the io-uring bg queue from fuse_uring_flush_bg This is useful to have a unique API to flush background requests. For example when the bg queue gets flushed before the remaining of fuse_conn_destroy(). Signed-off-by: Bernd Schubert (imported from commit fc4120cc58e7fbcb541bf2e9a72781b569561912) --- fs/fuse/dev.c | 2 ++ fs/fuse/dev_uring.c | 3 +++ fs/fuse/dev_uring_i.h | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c076904e8e4b95..d5e62f132e1dba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2467,6 +2467,8 @@ void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) spin_unlock(&fc->bg_lock); spin_unlock(&fc->lock); + fuse_uring_flush_bg(fc); + /* * Wait 30s for all the events to complete or abort. Touch the * watchdog once per second so that we don't trip the hangcheck timer diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 04888f8b263592..fde1b6100d3218 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -128,6 +128,9 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) struct fuse_ring_queue *queue; struct fuse_ring *ring = fc->ring; + if (!ring) + return; + for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index ea86d4084e7676..305c5869fde251 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -210,6 +210,10 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +static inline void fuse_uring_flush_bg(struct fuse_conn *fc) +{ +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ From 8c810fa32f95921ed46ddc01ecbd950fc50de10b Mon Sep 17
00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 15:54:09 +0200 Subject: [PATCH 14/46] fuse: fix unnecessary connection abort in dlm lock acquiring When calling the fuse server with a dlm request and the fuse server responds with some other error than ENOSYS most likely the lock size will be set to zero. In that case the kernel will abort the fuse connection. This is completely unnecessary. Signed-off-by: Horst Birthelmer (imported from commit 0bc2f9c39c52ad11a1753e5be376c424b06f43db) --- fs/fuse/fuse_dlm_cache.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index ea947f34a9f70a..a9cad2c1bd2174 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -533,19 +533,19 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, return; } - if (outarg.locksize < end - offset + 1) { - /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); - fuse_abort_conn(fc); - return; - } - if (err) return; else - /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, + if (outarg.locksize < end - offset + 1) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", + end - offset + 1, outarg.locksize); + fuse_abort_conn(fc); + return; + } else { + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, offset, offset + outarg.locksize - 1, FUSE_PAGE_LOCK_WRITE); + } } From 1d4a8b3c1847f38f8a2728eacbabcaba0d308ff5 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 21 Jul 2025 18:15:55 +0200 Subject: [PATCH 15/46] fuse: fix connection abort on mmap when fuse server returns ENOSYS Check whether dlm is still enabled when interpreting the returned error from fuse server. 
Signed-off-by: Horst Birthelmer (imported from commit f6fbf7c7bfb976ae2a30b4d699770a13e699ff04) --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c029641d34f5d3..bed01bf8121658 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2384,7 +2384,7 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && outarg.locksize < length) { + if (!err && fc->dlm && outarg.locksize < length) { /* fuse server is seriously broken */ pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", length, outarg.locksize); From 3ce2031a3fb71619d72d1b153ace0235be15a882 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Wed, 20 Aug 2025 16:56:43 +0200 Subject: [PATCH 16/46] fuse: change FUSE DLM_LOCK to request start and end of area - Increase the possible lock size to 64 bit. - change semantics of DLM locks to request start and end - change semantics of DLM request return to mark start and end of the locked area - better prepare dlm lock range cache rb-tree for unaligned byte range locks which could return any value as long as it is larger than the range requested - add the case where start and end are zero to destroy the cache Signed-off-by: Horst Birthelmer (imported from commit 87968c738b67b07084b19b5e727074c0604d7ba6) --- fs/fuse/file.c | 13 +++++--- fs/fuse/fuse_dlm_cache.c | 67 +++++++++++++++++++++------------------ fs/fuse/fuse_dlm_cache.h | 12 +++---- fs/fuse/inode.c | 11 ++++--- include/uapi/linux/fuse.h | 11 ++++--- 5 files changed, 64 insertions(+), 50 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bed01bf8121658..aac4702b08cd77 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2366,8 +2366,8 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = length; + inarg.start = offset; + inarg.end = offset + 
length - 1; inarg.type = FUSE_DLM_PAGE_MKWRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -2384,10 +2384,13 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l err = 0; } - if (!err && fc->dlm && outarg.locksize < length) { + if (!err && + fc->dlm && + (outarg.start > inarg.start || + outarg.end < inarg.end)) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n", - length, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu bytes returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); err = -EINVAL; } diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c index a9cad2c1bd2174..d765dd8018cc6a 100644 --- a/fs/fuse/fuse_dlm_cache.c +++ b/fs/fuse/fuse_dlm_cache.c @@ -16,11 +16,11 @@ struct fuse_dlm_range { /* Interval tree node */ struct rb_node rb; /* Start page offset (inclusive) */ - pgoff_t start; + uint64_t start; /* End page offset (inclusive) */ - pgoff_t end; + uint64_t end; /* Subtree end value for interval tree */ - pgoff_t __subtree_end; + uint64_t __subtree_end; /* Lock mode */ enum fuse_page_lock_mode mode; /* Temporary list entry for operations */ @@ -32,19 +32,19 @@ struct fuse_dlm_range { #define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ /* Interval tree definitions for page ranges */ -static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_start(struct fuse_dlm_range *range) { return range->start; } -static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range) +static inline uint64_t fuse_dlm_range_last(struct fuse_dlm_range *range) { return range->end; } -INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end, - fuse_dlm_range_start, fuse_dlm_range_last, static, - fuse_page_it); +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, uint64_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + 
fuse_page_it); /** * fuse_page_cache_init - Initialize a page cache lock manager @@ -101,8 +101,8 @@ void fuse_dlm_cache_release_locks(struct fuse_inode *inode) * Return: Pointer to the first overlapping range, or NULL if none found */ static struct fuse_dlm_range * -fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { return fuse_page_it_iter_first(&cache->ranges, start, end); } @@ -116,8 +116,8 @@ fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start, * Attempt to merge ranges within and adjacent to the specified region * that have the same lock mode. */ -static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) { struct fuse_dlm_range *range, *next; struct rb_node *node; @@ -182,8 +182,8 @@ static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *new_range, *next; @@ -191,7 +191,7 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, int ret = 0; LIST_HEAD(to_lock); LIST_HEAD(to_upgrade); - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return -EINVAL; @@ -304,8 +304,8 @@ int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, * * Return: 0 on success, negative error code on failure */ -static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, - pgoff_t end) +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, uint64_t 
start, + uint64_t end) { struct fuse_dlm_range *range, *new_range; int ret = 0; @@ -363,11 +363,12 @@ static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start, * @end: End page offset * * Release locks on the specified range of pages. + * Note that if start and end are set to zero the cache is destroyed. * * Return: 0 on success, negative error code on failure */ int fuse_dlm_unlock_range(struct fuse_inode *inode, - pgoff_t start, pgoff_t end) + uint64_t start, uint64_t end) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range, *next; @@ -376,6 +377,11 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, if (!cache) return -EINVAL; + if (start == 0 && end == 0) { + fuse_dlm_cache_release_locks(inode); + return 0; + } + down_write(&cache->lock); /* Find all ranges that overlap with [start, end] */ @@ -424,13 +430,13 @@ int fuse_dlm_unlock_range(struct fuse_inode *inode, * * Return: true if the entire range is locked, false otherwise */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode) +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) { struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; struct fuse_dlm_range *range; int lock_mode = 0; - pgoff_t current_start = start; + uint64_t current_start = start; if (!cache || start > end) return false; @@ -491,7 +497,7 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_mount *fm = ff->fm; - loff_t end = (offset + length - 1) | (PAGE_SIZE - 1); + uint64_t end = (offset + length - 1) | (PAGE_SIZE - 1); /* note that the offset and length don't have to be page aligned here * but since we only get here on writeback caching we will send out @@ -514,8 +520,8 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, 
memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.offset = offset; - inarg.size = end - offset + 1; + inarg.start = offset; + inarg.end = end; inarg.type = FUSE_DLM_LOCK_WRITE; args.opcode = FUSE_DLM_WB_LOCK; @@ -536,16 +542,17 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, if (err) return; else - if (outarg.locksize < end - offset + 1) { + if (inarg.start < outarg.start || + inarg.end > outarg.end) { /* fuse server is seriously broken */ - pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n", - end - offset + 1, outarg.locksize); + pr_warn("fuse: dlm lock request for %llu:%llu returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); fuse_abort_conn(fc); return; } else { /* ignore any errors here, there is no way we can react appropriately */ - fuse_dlm_lock_range(fi, offset, - offset + outarg.locksize - 1, - FUSE_PAGE_LOCK_WRITE); + fuse_dlm_lock_range(fi, outarg.start, + outarg.end, + FUSE_PAGE_LOCK_WRITE); } } diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h index 98b27a2c15d8ba..438d31d28b666e 100644 --- a/fs/fuse/fuse_dlm_cache.h +++ b/fs/fuse/fuse_dlm_cache.h @@ -32,16 +32,16 @@ int fuse_dlm_cache_init(struct fuse_inode *inode); void fuse_dlm_cache_release_locks(struct fuse_inode *inode); /* Lock a range of pages */ -int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); /* Unlock a range of pages */ -int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start, - pgoff_t end); +int fuse_dlm_unlock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end); /* Check if a page range is already locked */ -bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start, - pgoff_t end, enum fuse_page_lock_mode mode); +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + 
uint64_t end, enum fuse_page_lock_mode mode); /* this is the interface to the filesystem */ void fuse_get_dlm_write_lock(struct file *file, loff_t offset, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e2563152bd6727..7d1f936f8a1ff2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -626,11 +626,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, pg_end = (offset + len - 1) >> PAGE_SHIFT; if (fc->dlm && fc->writeback_cache) - /* invalidate the range from the beginning of the first page - * in the given range to the last byte of the last page */ + /* Invalidate the range exactly as the fuse server requested + * except for the case where it sends -1. + * Note that this can lead to some inconsistencies if + * the fuse server sends unaligned data */ fuse_dlm_unlock_range(fi, - pg_start << PAGE_SHIFT, - (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1)); + offset, + pg_end == -1 ? 0 : + (offset + len - 1)); invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e3acfb4aa34269..dd463d13585043 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1271,10 +1271,10 @@ enum fuse_dlm_lock_type { */ struct fuse_dlm_lock_in { uint64_t fh; - uint64_t offset; - uint32_t size; + uint64_t start; + uint64_t end; uint32_t type; - uint64_t reserved; + uint32_t reserved; }; /** @@ -1284,8 +1284,9 @@ struct fuse_dlm_lock_in { * to reduce number of calls) */ struct fuse_dlm_lock_out { - uint32_t locksize; - uint32_t padding; + uint64_t start; + uint64_t end; + uint64_t reserved; }; /** From 13eef8050601d7f269fed15a26883f1a95a9d520 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 24 Sep 2025 08:12:17 +0000 Subject: [PATCH 17/46] fuse: fix memory leak in fuse-over-io-uring argument copies Fix reference count leak of payload pages during fuse argument copies. 
Signed-off-by: Cheng Ding (imported from commit 8b75cf05a2efc20e8f46ba9e10664c502249ee21) --- fs/fuse/dev_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index fde1b6100d3218..9c72223678e0be 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -674,11 +674,14 @@ static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, (struct fuse_arg *)in_args, 0); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + +copy_finish: + fuse_copy_finish(&cs); return err; } @@ -735,12 +738,14 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_finish(&cs); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); +copy_finish: + fuse_copy_finish(&cs); return err ? -EFAULT : 0; } From cdd444eb3b074e2d5a678a9b00e81d72498fd526 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 2 Jun 2025 23:23:43 +0200 Subject: [PATCH 18/46] fuse: {io-uring} Add queue length counters This is another preparation and will be used to decide which queue to add a request to.
Signed-off-by: Bernd Schubert Reviewed-by: Joanne Koong (imported from commit e4698faf912435f7f3f28c169f7bb8342d7b1edf) --- fs/fuse/dev_uring.c | 17 +++++++++++++++-- fs/fuse/dev_uring_i.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9c72223678e0be..9bb6573c826173 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -89,6 +89,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + queue->nr_reqs--; list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; @@ -96,7 +97,6 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } - spin_unlock(&queue->lock); if (error) @@ -116,6 +116,7 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); + queue->nr_reqs = 0; spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ @@ -1498,10 +1499,13 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); + queue->nr_reqs++; + if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); if (ent) @@ -1537,6 +1541,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) set_bit(FR_URING, &req->flags); req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); + queue->nr_reqs++; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); @@ -1569,8 +1574,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req 
*req) bool fuse_uring_remove_pending_req(struct fuse_req *req) { struct fuse_ring_queue *queue = req->ring_queue; + bool removed = fuse_remove_pending_req(req, &queue->lock); + + if (removed) { + /* Update counters after successful removal */ + spin_lock(&queue->lock); + queue->nr_reqs--; + spin_unlock(&queue->lock); + } - return fuse_remove_pending_req(req, &queue->lock); + return removed; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 305c5869fde251..f4e707a6711138 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -98,6 +98,9 @@ struct fuse_ring_queue { /* background fuse requests */ struct list_head fuse_req_bg_queue; + /* number of requests queued or in userspace */ + unsigned int nr_reqs; + struct fuse_pqueue fpq; unsigned int active_background; From 9f6de8b57a2b3bf8acb377f3fe3e2759743e9be7 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 13 Jun 2025 15:12:47 +0200 Subject: [PATCH 19/46] fuse: {io-uring} Rename ring->nr_queues to max_nr_queues This is preparation for follow up commits that allow to run with a reduced number of queues. 
Signed-off-by: Bernd Schubert (imported from commit 2e27c33ffcf65b434ada1364a4d2ea92b094f0c3) --- fs/fuse/dev_uring.c | 22 +++++++++++----------- fs/fuse/dev_uring_i.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9bb6573c826173..7e1a51d5037bce 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -132,7 +132,7 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -214,7 +214,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -277,7 +277,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - ring->nr_queues = nr_queues; + ring->max_nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; smp_store_release(&fc->ring, ring); @@ -429,7 +429,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) int qid; struct fuse_ring_ent *ent; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) @@ -460,7 +460,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -495,7 +495,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct 
fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -988,7 +988,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, if (!ring) return err; - if (qid >= ring->nr_queues) + if (qid >= ring->max_nr_queues) return -EINVAL; queue = ring->queues[qid]; @@ -1051,7 +1051,7 @@ static bool is_ring_ready(struct fuse_ring *ring, int current_qid) struct fuse_ring_queue *queue; bool ready = true; - for (qid = 0; qid < ring->nr_queues && ready; qid++) { + for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { if (current_qid == qid) continue; @@ -1291,7 +1291,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, return err; } - if (qid >= ring->nr_queues) { + if (qid >= ring->max_nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } @@ -1436,9 +1436,9 @@ static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) qid = task_cpu(current); - if (WARN_ONCE(qid >= ring->nr_queues, + if (WARN_ONCE(qid >= ring->max_nr_queues, "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->nr_queues)) + ring->max_nr_queues)) qid = 0; queue = ring->queues[qid]; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index f4e707a6711138..0a5e826100585f 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -117,7 +117,7 @@ struct fuse_ring { struct fuse_conn *fc; /* number of ring queues */ - size_t nr_queues; + size_t max_nr_queues; /* maximum payload/arg size */ size_t max_payload_sz; From 5a0f4bfbad64c9854cba9b97a829d45f7796da93 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 10 Jun 2025 16:23:28 +0200 Subject: [PATCH 20/46] fuse: {io-uring} Use bitmaps to track registered queues Add per-CPU and per-NUMA node bitmasks to track which io-uring queues are registered. 
Signed-off-by: Bernd Schubert (imported from commit be6edce441ecc37ee34a8937f07c01ab99bfb7f7) --- fs/fuse/dev_uring.c | 79 +++++++++++++++++++++++++++++++++++++++++-- fs/fuse/dev_uring_i.h | 20 +++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 7e1a51d5037bce..dbbf3a7f949614 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -163,6 +163,24 @@ static void io_pages_free(struct page ***pages, int npages) *pages = NULL; } + +static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map) +{ + free_cpumask_var(q_map->registered_q_mask); + kfree(q_map->cpu_to_qid); +} + +static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) +{ + int node; + + fuse_ring_destruct_q_map(&ring->q_map); + + if (ring->numa_q_map) + for (node = 0; node < ring->nr_numa_nodes; node++) + fuse_ring_destruct_q_map(&ring->numa_q_map[node]); +} + static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) { struct fuse_ring_ent *ent; @@ -187,7 +205,7 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) if (!ring) return false; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -240,11 +258,45 @@ void fuse_uring_destruct(struct fuse_conn *fc) ring->queues[qid] = NULL; } + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); fc->ring = NULL; } +static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) +{ + if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; + + q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), + GFP_KERNEL_ACCOUNT); + + return 0; +} + +static int fuse_uring_create_q_masks(struct fuse_ring *ring) +{ + int err, node; + + err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + if (err) + return err; + + ring->numa_q_map = kcalloc(ring->nr_numa_nodes, + 
sizeof(*ring->numa_q_map), + GFP_KERNEL_ACCOUNT); + if (!ring->numa_q_map) + return -ENOMEM; + for (node = 0; node < ring->nr_numa_nodes; node++) { + err = fuse_uring_init_q_map(&ring->numa_q_map[node], + ring->max_nr_queues); + if (err) + return err; + } + return 0; +} + /* * Basic ring setup for this connection based on the provided configuration */ @@ -254,19 +306,26 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; + int err; ring = kzalloc_obj(*fc->ring, GFP_KERNEL_ACCOUNT); if (!ring) return NULL; - ring->queues = kzalloc_objs(struct fuse_ring_queue *, nr_queues, - GFP_KERNEL_ACCOUNT); + ring->nr_numa_nodes = num_online_nodes(); + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); if (!ring->queues) goto out_err; max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + err = fuse_uring_create_q_masks(ring); + if (err) + goto out_err; + spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ @@ -286,6 +345,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return ring; out_err: + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); return res; @@ -448,6 +508,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } + spin_unlock(&queue->lock); } ring->stop_debug_log = 1; @@ -494,6 +555,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; + int node; for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); @@ -505,6 +567,13 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) fuse_uring_teardown_entries(queue); } + /* Reset 
all queue masks, we won't process any more IO */ + cpumask_clear(ring->q_map.registered_q_mask); + for (node = 0; node < ring->nr_numa_nodes; node++) { + if (ring->numa_q_map) + cpumask_clear(ring->numa_q_map[node].registered_q_mask); + } + if (atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, @@ -1081,6 +1150,10 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; struct fuse_iqueue *fiq = &fc->iq; + int node = cpu_to_node(queue->qid); + + if (WARN_ON_ONCE(node >= ring->nr_numa_nodes)) + node = 0; fuse_uring_prepare_cancel(cmd, issue_flags, ent); diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 0a5e826100585f..86fef37a863a1e 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -108,6 +108,17 @@ struct fuse_ring_queue { bool stopped; }; +struct fuse_queue_map { + /* Tracks which queues are registered */ + cpumask_var_t registered_q_mask; + + /* number of registered queues */ + size_t nr_queues; + + /* cpu to qid mapping */ + int *cpu_to_qid; +}; + /** * Describes if uring is for communication and holds alls the data needed * for uring communication @@ -119,6 +130,9 @@ struct fuse_ring { /* number of ring queues */ size_t max_nr_queues; + /* number of numa nodes */ + int nr_numa_nodes; + /* maximum payload/arg size */ size_t max_payload_sz; @@ -129,6 +143,12 @@ struct fuse_ring { */ unsigned int stop_debug_log : 1; + /* per numa node queue tracking */ + struct fuse_queue_map *numa_q_map; + + /* all queue tracking */ + struct fuse_queue_map q_map; + wait_queue_head_t stop_waitq; /* async tear down */ From a7f6de4e26a740eb9d5b0eb5adfdbd528094e926 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 4 Jun 2025 19:32:39 +0200 Subject: [PATCH 21/46] fuse: {io-uring} Allow reduced number of ring queues Queue selection (fuse_uring_get_queue) can handle a reduced number of queues - using io-uring is
possible now even with a single queue and entry. The FUSE_URING_REDUCED_Q flag is being introduced to tell fuse server that reduced queues are possible, i.e. if the flag is set, fuse server is free to reduce the number of queues. Signed-off-by: Bernd Schubert (imported from commit f620f3d35969bd9a04304b757a18a11a0787dedc) --- fs/fuse/dev_uring.c | 124 +++++++++++++++++++++++--------------- fs/fuse/inode.c | 8 +-- include/uapi/linux/fuse.h | 4 ++ 3 files changed, 84 insertions(+), 52 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index dbbf3a7f949614..b3927356be722a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -271,15 +271,17 @@ static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), GFP_KERNEL_ACCOUNT); + if (!q_map->cpu_to_qid) + return -ENOMEM; return 0; } -static int fuse_uring_create_q_masks(struct fuse_ring *ring) +static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues) { int err, node; - err = fuse_uring_init_q_map(&ring->q_map, ring->max_nr_queues); + err = fuse_uring_init_q_map(&ring->q_map, nr_queues); if (err) return err; @@ -290,7 +292,7 @@ static int fuse_uring_create_q_masks(struct fuse_ring *ring) return -ENOMEM; for (node = 0; node < ring->nr_numa_nodes; node++) { err = fuse_uring_init_q_map(&ring->numa_q_map[node], - ring->max_nr_queues); + nr_queues); if (err) return err; } @@ -322,7 +324,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); - err = fuse_uring_create_q_masks(ring); + err = fuse_uring_create_q_masks(ring, nr_queues); if (err) goto out_err; @@ -351,12 +353,37 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return res; } +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, + struct fuse_queue_map *q_map) +{ + int cpu,
qid_idx; + size_t nr_queues; + + cpumask_set_cpu(qid, q_map->registered_q_mask); + nr_queues = cpumask_weight(q_map->registered_q_mask); + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { + if (!q_map->cpu_to_qid) + return; + + /* + * Position of this CPU within the registered queue mask, + * handles non-contiguous CPU distributions across NUMA nodes. + */ + qid_idx = bitmap_weight( + cpumask_bits(q_map->registered_q_mask), cpu); + + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx % nr_queues, + q_map->registered_q_mask); + } +} + static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; + int node; queue = kzalloc_obj(*queue, GFP_KERNEL_ACCOUNT); if (!queue) @@ -394,6 +421,22 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); + + /* Static mapping from cpu to per numa queues */ + node = cpu_to_node(qid); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node]); + + /* + * smp_store_release, as the variable is read without fc->lock and + * we need to avoid compiler re-ordering of updating the nr_queues + * and setting ring->numa_queues[node].cpu_to_qid above + */ + smp_store_release (&ring->numa_q_map[node].nr_queues, + ring->numa_q_map[node].nr_queues + 1); + + /* global mapping */ + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map); + spin_unlock(&fc->lock); return queue; @@ -1114,31 +1157,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -static bool is_ring_ready(struct fuse_ring *ring, int current_qid) -{ - int qid; - struct fuse_ring_queue *queue; - bool ready = true; - - for (qid = 0; qid < ring->max_nr_queues && ready; qid++) { - if (current_qid == qid) - continue; - - queue = ring->queues[qid]; - if (!queue) { - ready = false; - break; - } - - 
spin_lock(&queue->lock); - if (list_empty(&queue->ent_avail_queue)) - ready = false; - spin_unlock(&queue->lock); - } - - return ready; -} - /* * fuse_uring_req_fetch command handling */ @@ -1163,13 +1181,9 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent, spin_unlock(&queue->lock); if (!ring->ready) { - bool ready = is_ring_ready(ring, queue->qid); - - if (ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); } } @@ -1502,22 +1516,36 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) { unsigned int qid; - struct fuse_ring_queue *queue; + int node; + unsigned int nr_queues; + unsigned int cpu = task_cpu(current); - qid = task_cpu(current); + cpu = cpu % ring->max_nr_queues; - if (WARN_ONCE(qid >= ring->max_nr_queues, - "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->max_nr_queues)) - qid = 0; + /* numa local registered queue bitmap */ + node = cpu_to_node(cpu); + if (WARN_ONCE(node >= ring->nr_numa_nodes, + "Node number (%d) exceeds nr nodes (%d)\n", + node, ring->nr_numa_nodes)) { + node = 0; + } - queue = ring->queues[qid]; - WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); + if (nr_queues) { + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); + } - return queue; + /* global registered queue bitmap */ + qid = ring->q_map.cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + return READ_ONCE(ring->queues[qid]); } static void 
fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) @@ -1557,7 +1585,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring); if (!queue) goto err; @@ -1601,7 +1629,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring); if (!queue) return false; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7d1f936f8a1ff2..7e19accd8f27f2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1557,8 +1557,7 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - flags = - FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -1570,8 +1569,9 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | + FUSE_EXPIRE_INODE_ENTRY | FUSE_URING_REDUCED_Q | + FUSE_EXPIRE_INODE_ENTRY | FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index dd463d13585043..605c755c8c6331 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -450,6 +450,8 @@ struct fuse_file_lock { * 
init_out.request_timeout contains the timeout (in secs) * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free + * to register between 1 and nr-core io-uring queues */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -497,6 +499,8 @@ struct fuse_file_lock { #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) +#define FUSE_ALIGN_PG_ORDER (1ULL << 50) +#define FUSE_URING_REDUCED_Q (1ULL << 59) #define FUSE_INVAL_INODE_ENTRY (1ULL << 60) #define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) From 1fe1b00840f41b8c66f18abae473290f6188e9a4 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 24 Sep 2025 19:14:19 +0200 Subject: [PATCH 22/46] fuse: {io-uring} Queue background requests on a different core Running background IO on a different core makes quite a difference. fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=272MiB/s (285MB/s) ... patched READ: bw=650MiB/s (682MB/s) Reason is easily visible, the fio process is migrating between CPUs when requests are submitted on the queue for the same core. 
With --iodepth=8 unpatched READ: bw=466MiB/s (489MB/s) patched READ: bw=641MiB/s (672MB/s) Without io-uring (--iodepth=8) READ: bw=729MiB/s (764MB/s) Without fuse (--iodepth=8) READ: bw=2199MiB/s (2306MB/s) (Tests were done with /example/passthrough_hp -o allow_other --nopassthrough \ [-o io_uring] /tmp/source /tmp/dest ) Additional notes: With FURING_NEXT_QUEUE_RETRIES=0 (--iodepth=8) READ: bw=903MiB/s (946MB/s) With just a random qid (--iodepth=8) READ: bw=429MiB/s (450MB/s) With --iodepth=1 unpatched READ: bw=195MiB/s (204MB/s) patched READ: bw=232MiB/s (243MB/s) With --iodepth=1 --numjobs=2 unpatched READ: bw=366MiB/s (384MB/s) patched READ: bw=472MiB/s (495MB/s) With --iodepth=1 --numjobs=8 unpatched READ: bw=1437MiB/s (1507MB/s) patched READ: bw=1529MiB/s (1603MB/s) fuse without io-uring READ: bw=1314MiB/s (1378MB/s), 1314MiB/s-1314MiB/s ... no-fuse READ: bw=2566MiB/s (2690MB/s), 2566MiB/s-2566MiB/s ... In summary, for async requests the core doing application IO is busy sending requests and processing IOs should be done on a different core. Spreading the load on random cores is also not desirable, as the core might be frequency scaled down and/or in C1 sleep states. Not shown here, but differences are much smaller when the system uses the performance governor instead of schedutil (ubuntu default). Obviously at the cost of higher system power consumption for the performance governor - not desirable either. Results without io-uring (which uses fixed libfuse threads per queue) heavily depend on the current number of active threads. Libfuse uses a default of max 10 threads, but the actual nr max threads is a parameter. Also, no-fuse-io-uring results heavily depend on whether another workload was already running before, as libfuse starts these threads dynamically - i.e. the more threads are active, the worse the performance.
Signed-off-by: Bernd Schubert (imported from commit c6399ea79b104ac79758f2c36f1977b80a02358d) --- fs/fuse/dev_uring.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index b3927356be722a..e9dfff3c912851 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1516,13 +1516,21 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; int node; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy. + */ + if (background) + cpu++; + cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1585,7 +1593,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; @@ -1629,7 +1637,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_select_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; From 489b160a640a8570e1b9888f57d79dc4fd75793b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 24 Oct 2025 19:05:07 +0200 Subject: [PATCH 23/46] fuse: Add retry attempts for numa local queues for load distribution This is to further improve performance. 
fio --directory=/tmp/dest --name=iops.\$jobnum --rw=randread \ --bs=4k --size=1G --numjobs=1 --iodepth=4 --time_based\ --runtime=30s --group_reporting --ioengine=io_uring\ --direct=1 unpatched READ: bw=650MiB/s (682MB/s) patched: READ: bw=995MiB/s (1043MB/s) with --iodepth=8 unpatched READ: bw=641MiB/s (672MB/s) patched READ: bw=966MiB/s (1012MB/s) Reason is that with --iodepth=x (x > 1) fio submits multiple async requests and a single queue might become CPU limited. I.e. spreading the load helps. (imported from commit 2e73b0be1f55d61c2d861a12bf6bb9963b9b877a) --- fs/fuse/dev_uring.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index e9dfff3c912851..f99418cfa698dd 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,6 +22,8 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +#define FUSE_URING_Q_THRESHOLD 2 + bool fuse_uring_enabled(void) { @@ -1520,9 +1522,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node; + int node, retries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; /* * Background requests result in better performance on a different @@ -1531,6 +1534,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, if (background) cpu++; +retry: cpu = cpu % ring->max_nr_queues; /* numa local registered queue bitmap */ @@ -1546,12 +1550,35 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - return READ_ONCE(ring->queues[qid]); + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) + return 
queue; + + /* Retries help for load balancing */ + if (retries < FUSE_URING_Q_THRESHOLD) { + if (!retries) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a differet qid*/ + cpu++; + retries++; + goto retry; + } } + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + /* Might happen on teardown */ return NULL; return READ_ONCE(ring->queues[qid]); } From 7f65f763e84b5aa3d81a28839193cee68ba3fc29 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 10 Nov 2025 13:17:38 +0100 Subject: [PATCH 24/46] fuse: Fetch a queued fuse request on command registration With the reduced queue feature io-uring is marked as ready after receiving the 1st ring entry. At this time other queues just might be in the process of registration and then a race happens fuse_uring_queue_fuse_req -> no queue entry registered yet list_add_tail -> fuse request gets queued So far fetching requests from the list only happened from FUSE_IO_URING_CMD_COMMIT_AND_FETCH, but without new requests on the same queue, it would actually never send requests from that queue - the request was stuck. 
(imported from commit 3bfb6cdc9b978a13eab59ebae592ddfa225c4c4a) --- fs/fuse/dev_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f99418cfa698dd..f88cc4c94aba8e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1403,6 +1403,8 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; } From 73c9855ebfdbb89a77b9301b11d47f4d2a51ff27 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Tue, 16 Sep 2025 13:31:45 +0200 Subject: [PATCH 25/46] fuse: add compound command to combine multiple requests fuse.h: add new opcode FUSE_COMPOUND fuse_compound.c: add new functionality to pack multiple fuse operations into one compound command file.c: add an implementation of open+getattr Signed-off-by: Horst Birthelmer (imported from commit d9e735140a3faccbe5786a7e75a4ad9a6a9aa2e0) (imported from commit 1607a03696693c4ceef7a61adf5759748a7ca9b0) (imported from commit 9df5e4cb96184aae03d7d49131b59a4767641d6b) (imported from commit 9921bcdc4e126a7606e036b04893a6bfd36b8c75) (imported from commit 09d6f59e98090b4de35bfe5344fd1ca5559d1c16) (imported from commit 41b40bdc0739af60f3fbabb4dd45006f801ebd0d) --- fs/fuse/Makefile | 2 +- fs/fuse/compound.c | 263 ++++++++++++++++++++++++++++++++++++++ fs/fuse/dev.c | 24 ++++ fs/fuse/dir.c | 9 +- fs/fuse/file.c | 154 ++++++++++++++++++---- fs/fuse/fuse_i.h | 24 +++- fs/fuse/inode.c | 6 + fs/fuse/ioctl.c | 2 +- include/uapi/linux/fuse.h | 38 ++++++ 9 files changed, 487 insertions(+), 35 deletions(-) create mode 100644 fs/fuse/compound.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 64bc8682ae9659..2407870803000d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := trace.o # put trace.o first so we see ftrace errors sooner -fuse-y += dev.o dir.o file.o inode.o 
control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c new file mode 100644 index 00000000000000..bc52e22eff3123 --- /dev/null +++ b/fs/fuse/compound.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2025 + * + * This file implements compound operations for FUSE, allowing multiple + * operations to be batched into a single request to reduce round trips + * between kernel and userspace. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Compound request builder and state tracker and args pointer storage + */ +struct fuse_compound_req { + struct fuse_mount *fm; + struct fuse_compound_in compound_header; + struct fuse_compound_out result_header; + + /* Per-operation error codes */ + int op_errors[FUSE_MAX_COMPOUND_OPS]; + struct fuse_args *op_args[FUSE_MAX_COMPOUND_OPS]; +}; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, u32 flags) +{ + struct fuse_compound_req *compound; + + compound = kzalloc(sizeof(*compound), GFP_KERNEL); + if (!compound) + return ERR_PTR(-ENOMEM); + + compound->fm = fm; + compound->compound_header.flags = flags; + + return compound; +} + +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args) +{ + if (!compound || + compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) + return -EINVAL; + + if (args->in_pages) + return -EINVAL; + + compound->op_args[compound->compound_header.count] = args; + compound->compound_header.count++; + return 0; +} + +static void *fuse_copy_response_per_req(struct fuse_args *args, + char *resp) +{ + int i; + size_t copied = 0; + + for 
(i = 0; i < args->out_numargs; i++) { + struct fuse_arg current_arg = args->out_args[i]; + size_t arg_size = current_arg.size; + + if (current_arg.value && arg_size > 0) { + memcpy(current_arg.value, + (char *)resp + copied, arg_size); + copied += arg_size; + } + } + + return (char *)resp + copied; +} + +int fuse_compound_get_error(struct fuse_compound_req *compound, int op_idx) +{ + return compound->op_errors[op_idx]; +} + +static void *fuse_compound_parse_one_op(struct fuse_compound_req *compound, + int op_index, void *op_out_data, + void *response_end) +{ + struct fuse_out_header *op_hdr = op_out_data; + struct fuse_args *args = compound->op_args[op_index]; + + if (op_hdr->len < sizeof(struct fuse_out_header)) + return NULL; + + /* Check if the entire operation response fits in the buffer */ + if ((char *)op_out_data + op_hdr->len > (char *)response_end) + return NULL; + + if (op_hdr->error != 0) + compound->op_errors[op_index] = op_hdr->error; + + if (args && op_hdr->len > sizeof(struct fuse_out_header)) + return fuse_copy_response_per_req(args, op_out_data + + sizeof(struct fuse_out_header)); + + /* No response data, just advance past the header */ + return (char *)op_out_data + op_hdr->len; +} + +static int fuse_compound_parse_resp(struct fuse_compound_req *compound, + u32 count, void *response, + size_t response_size) +{ + void *op_out_data = response; + void *response_end = (char *)response + response_size; + int i; + + if (!response || response_size < sizeof(struct fuse_out_header)) + return -EIO; + + for (i = 0; i < count && i < compound->result_header.count; i++) { + op_out_data = fuse_compound_parse_one_op(compound, i, + op_out_data, + response_end); + if (!op_out_data) + return -EIO; + } + + return 0; +} + +ssize_t fuse_compound_send(struct fuse_compound_req *compound) +{ + struct fuse_args args = { + .opcode = FUSE_COMPOUND, + .nodeid = 0, + .in_numargs = 2, + .out_numargs = 2, + .out_argvar = true, + }; + size_t resp_buffer_size; + size_t 
actual_response_size; + size_t buffer_pos; + size_t total_expected_out_size; + void *buffer = NULL; + void *resp_payload; + ssize_t ret; + int i; + + if (!compound) { + pr_info_ratelimited("FUSE: compound request is NULL in %s\n", + __func__); + return -EINVAL; + } + + if (compound->compound_header.count == 0) { + pr_info_ratelimited("FUSE: compound request contains no operations\n"); + return -EINVAL; + } + + buffer_pos = 0; + total_expected_out_size = 0; + + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + buffer_pos += needed_size; + + for (j = 0; j < op_args->out_numargs; j++) + total_expected_out_size += op_args->out_args[j].size; + } + + buffer = kvmalloc(buffer_pos, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer_pos = 0; + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + struct fuse_in_header *hdr; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + hdr = (struct fuse_in_header *)(buffer + buffer_pos); + memset(hdr, 0, sizeof(*hdr)); + hdr->len = needed_size; + hdr->opcode = op_args->opcode; + hdr->nodeid = op_args->nodeid; + hdr->uid = from_kuid(compound->fm->fc->user_ns, + current_fsuid()); + hdr->gid = from_kgid(compound->fm->fc->user_ns, + current_fsgid()); + hdr->pid = pid_nr_ns(task_pid(current), + compound->fm->fc->pid_ns); + buffer_pos += sizeof(*hdr); + + for (j = 0; j < op_args->in_numargs; j++) { + memcpy(buffer + buffer_pos, op_args->in_args[j].value, + op_args->in_args[j].size); + buffer_pos += op_args->in_args[j].size; + } + } + + resp_buffer_size = total_expected_out_size + + (compound->compound_header.count * + sizeof(struct fuse_out_header)); + + resp_payload 
= kvmalloc(resp_buffer_size, GFP_KERNEL | __GFP_ZERO); + if (!resp_payload) { + ret = -ENOMEM; + goto out_free_buffer; + } + + compound->compound_header.result_size = total_expected_out_size; + + args.in_args[0].size = sizeof(compound->compound_header); + args.in_args[0].value = &compound->compound_header; + args.in_args[1].size = buffer_pos; + args.in_args[1].value = buffer; + + args.out_args[0].size = sizeof(compound->result_header); + args.out_args[0].value = &compound->result_header; + args.out_args[1].size = resp_buffer_size; + args.out_args[1].value = resp_payload; + + ret = fuse_simple_request(compound->fm, &args); + if (ret < 0) + goto out; + + actual_response_size = args.out_args[1].size; + + if (actual_response_size < sizeof(struct fuse_compound_out)) { + pr_info_ratelimited("FUSE: compound response too small (%zu bytes, minimum %zu bytes)\n", + actual_response_size, + sizeof(struct fuse_compound_out)); + ret = -EINVAL; + goto out; + } + + ret = fuse_compound_parse_resp(compound, compound->result_header.count, + (char *)resp_payload, + actual_response_size); +out: + kvfree(resp_payload); +out_free_buffer: + kvfree(buffer); + return ret; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d5e62f132e1dba..703c56e5f63a3e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -663,6 +663,30 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } +ssize_t fuse_compound_request(struct fuse_mount *fm, struct fuse_args *args) +{ + struct fuse_req *req; + ssize_t ret; + + req = fuse_get_req(&invalid_mnt_idmap, fm, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_args_to_req(req, args); + + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + + __fuse_request_send(req); + ret = req->out.h.error; + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + ret = args->out_args[args->out_numargs - 1].size; + } + fuse_put_request(req); + return ret; +} + ssize_t __fuse_simple_request(struct 
mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_args *args) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c1179ce8fc96b2..d67858330bd1f6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1493,14 +1493,7 @@ static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.opcode = FUSE_GETATTR; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + fuse_getattr_args_fill(&args, get_node_id(inode), &inarg, &outarg); err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aac4702b08cd77..5f331e6136931b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -24,6 +24,39 @@ #include #include +/* + * Helper function to initialize fuse_args for OPEN/OPENDIR operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg) +{ + args->opcode = opcode; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + +/* + * Helper function to initialize fuse_args for GETATTR operations + */ +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg) +{ + args->opcode = FUSE_GETATTR; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, struct 
fuse_open_out *outargp) @@ -41,14 +74,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = opcode; - args.nodeid = nodeid; - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(*outargp); - args.out_args[0].value = outargp; + fuse_open_args_fill(&args, nodeid, opcode, &inarg, outargp); return fuse_simple_request(fm, &args); } @@ -127,8 +153,66 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) } } +static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, + int flags, int opcode, + struct fuse_file *ff, + struct fuse_attr_out *outattrp, + struct fuse_open_out *outopenp) +{ + struct fuse_compound_req *compound; + struct fuse_args open_args = {}; + struct fuse_args getattr_args = {}; + struct fuse_open_in open_in = {}; + struct fuse_getattr_in getattr_in = {}; + int err; + + compound = fuse_compound_alloc(fm, 0); + if (IS_ERR(compound)) + return PTR_ERR(compound); + + open_in.flags = flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) + open_in.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) + open_in.open_flags |= FUSE_OPEN_KILL_SUIDGID; + + fuse_open_args_fill(&open_args, nodeid, opcode, &open_in, outopenp); + + err = fuse_compound_add(compound, &open_args); + if (err) + goto out; + + fuse_getattr_args_fill(&getattr_args, nodeid, &getattr_in, outattrp); + + err = fuse_compound_add(compound, &getattr_args); + if (err) + goto out; + + err = fuse_compound_send(compound); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 0); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 1); + if (err) + goto out; + + ff->fh = outopenp->fh; + ff->open_flags = outopenp->open_flags; + +out: + kfree(compound); + return err; +} + struct fuse_file *fuse_file_open(struct 
fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir) + struct inode *inode, + unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; @@ -154,23 +238,46 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (open) { /* Store outarg for fuse_finish_open() */ struct fuse_open_out *outargp = &ff->args->open_outarg; - int err; + int err = -ENOSYS; + + if (inode && fc->compound_open_getattr) { + + struct fuse_attr_out attr_outarg; + + err = fuse_compound_open_getattr(fm, nodeid, open_flags, + opcode, ff, + &attr_outarg, outargp); + if (err == -ENOSYS) + fc->compound_open_getattr = 0; + if (!err) + fuse_change_attributes(inode, &attr_outarg.attr, + NULL, + ATTR_TIMEOUT(&attr_outarg), + fuse_get_attr_version(fc)); + } + if (err == -ENOSYS) { + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); + if (!err) { + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; + } + } - err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); - if (!err) { - ff->fh = outargp->fh; - ff->open_flags = outargp->open_flags; - } else if (err != -ENOSYS) { - fuse_file_free(ff); - return ERR_PTR(err); - } else { - if (isdir) { + if (err) { + if (err != -ENOSYS) { + /* err is not ENOSYS */ + fuse_file_free(ff); + return ERR_PTR(err); + } else { /* No release needed */ kfree(ff->args); ff->args = NULL; - fc->no_opendir = 1; - } else { - fc->no_open = 1; + + /* we don't have open */ + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } } @@ -186,11 +293,10 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { - struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + struct fuse_file *ff = fuse_file_open(fm, nodeid, file_inode(file), file->f_flags, isdir); if (!IS_ERR(ff)) file->private_data = ff; - return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); diff --git 
a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dbefbcf3c14d5f..f982875b6b0ed8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -948,6 +948,8 @@ struct fuse_conn { /* Use io_uring for communication */ unsigned int io_uring; + /* Does the filesystem support compound operations? */ + unsigned int compound_open_getattr:1; /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -1203,6 +1205,14 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); +/* + * Helper functions to initialize fuse_args for common operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg); +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg); struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); @@ -1294,6 +1304,8 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, return __fuse_simple_request(idmap, fm, args); } +ssize_t fuse_compound_request(struct fuse_mount *fm, struct fuse_args *args); + int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); @@ -1301,6 +1313,14 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, * Assign a unique id to a fuse request */ void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req); +struct fuse_compound_req; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, uint32_t flags); +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args); +ssize_t fuse_compound_send(struct fuse_compound_req *compound); +int fuse_compound_get_error(struct fuse_compound_req * compound, + int op_idx); /** * End a finished request @@ -1573,7 +1593,9 @@ void fuse_file_io_release(struct fuse_file *ff, 
struct inode *inode); /* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir); + struct inode *inode, + unsigned int open_flags, + bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7e19accd8f27f2..56faa9dd278f01 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1049,6 +1049,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; fc->dlm = 1; + + /* pretend fuse server supports compound operations + * until it tells us otherwise. + */ + fc->compound_open_getattr = 1; + atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fdc175e93f7474..07a02e47b2c3a6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -494,7 +494,7 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); - return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); + return fuse_file_open(fm, get_node_id(inode), NULL, O_RDONLY, isdir); } static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 605c755c8c6331..30bb854fbc9408 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -675,6 +675,13 @@ enum fuse_opcode { /* Operations which have not been merged into upstream */ FUSE_DLM_WB_LOCK = 100, + /* A compound request works like multiple simple requests. + * This is a special case for calls that can be combined atomic on the + * fuse server. If the server actually does atomically execute the command is + * left to the fuse server implementation. 
+ */ + FUSE_COMPOUND = 101, + /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1281,6 +1288,7 @@ struct fuse_dlm_lock_in { uint32_t reserved; }; + /** * struct fuse_dlm_lock_out - Lock response * @locksize: how many bytes where locked by the call @@ -1293,6 +1301,36 @@ struct fuse_dlm_lock_out { uint64_t reserved; }; +/* + * Compound request header + * + * This header is followed by the fuse requests + */ +struct fuse_compound_in { + uint32_t count; /* Number of operations */ + uint32_t flags; /* Compound flags */ + + /* Total size of all results. + * This is needed for preallocating the whole result for all + * commands in this compound. + */ + uint32_t result_size; + uint64_t reserved; +}; + +/* + * Compound response header + * + * This header is followed by complete fuse responses + */ +struct fuse_compound_out { + uint32_t count; /* Number of results */ + uint32_t flags; /* Result flags */ + uint64_t reserved; +}; + +#define FUSE_MAX_COMPOUND_OPS 16 /* Maximum operations per compound */ + /** * Size of the ring buffer header */ From 901deb597a8ac7bd1c13029f7219eea2d627ff17 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 12 Dec 2025 14:13:10 +0100 Subject: [PATCH 26/46] RED-34640: Fix a startup teardown race There was a race between fuse_uring_cancel() and fuse_uring_register()/fuse_uring_next_fuse_req(), which comes from the queue reduction feature. Race was core-A core-B fuse_uring_register spin_lock(&queue->lock); fuse_uring_ent_avail() spin_unlock(&queue->lock); fuse_uring_cancel() spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; list_move() fuse_uring_next_fuse_req() spin_lock(&queue->lock); fuse_uring_ent_avail(ent, queue); fuse_uring_send_next_to_ring() spin_unlock(&queue->lock); fuse_uring_send_next_to_ring I.e. fuse_uring_ent_avail() was called two times and the 2nd time when the entry was actually already handled by fuse_uring_cancel(). Solution is to not call fuse_uring_ent_avail() from fuse_uring_register. 
With that the entry is not in state FRRS_AVAILABLE and fuse_uring_cancel() will not touch it. fuse_uring_send_next_to_ring() will mark it as FRRS_AVAILABLE, and then either assign a request to it and change state again or will not touch it at all anymore - race fixed. This will be folded into the upstream queue reduction patches and therefore has the RED-34640 commit message. Also entirely removed is fuse_uring_do_register() as remaining work can be done by the caller. Signed-off-by: Bernd Schubert (imported from commit 932febaee72bfc10a391cdfa14a2b7f37549d967) --- fs/fuse/dev_uring.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f88cc4c94aba8e..52882280763ab9 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1159,36 +1159,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -/* - * fuse_uring_req_fetch command handling - */ -static void fuse_uring_do_register(struct fuse_ring_ent *ent, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - struct fuse_ring_queue *queue = ent->queue; - struct fuse_ring *ring = queue->ring; - struct fuse_conn *fc = ring->fc; - struct fuse_iqueue *fiq = &fc->iq; - int node = cpu_to_node(queue->qid); - - if (WARN_ON_ONCE(node >= ring->nr_numa_nodes)) - node = 0; - - fuse_uring_prepare_cancel(cmd, issue_flags, ent); - - spin_lock(&queue->lock); - ent->cmd = cmd; - fuse_uring_ent_avail(ent, queue); - spin_unlock(&queue->lock); - - if (!ring->ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } -} - /* * Copy from memmap.c, should be exported there */ @@ -1370,6 +1340,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; + struct fuse_iqueue *fiq = &fc->iq; int err; unsigned 
int qid = READ_ONCE(cmd_req->qid); @@ -1401,8 +1372,18 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, if (IS_ERR(ent)) return PTR_ERR(ent); - fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + if (!ring->ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + + spin_lock(&queue->lock); + ent->cmd = cmd; + spin_unlock(&queue->lock); + /* Marks the ring entry as ready */ fuse_uring_next_fuse_req(ent, queue, issue_flags); return 0; From 9bb2806ee2c557743f1ccbec45b00d662997bb52 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Oct 2025 23:17:15 +0200 Subject: [PATCH 27/46] fuse: Move ring queues_refs decrement This is just to avoid code dup with an upcoming commit. Signed-off-by: Bernd Schubert (imported from commit ec3217f655d816ac9e3e29b1dc1506d7b195a0a5) --- fs/fuse/dev_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 52882280763ab9..a6d3ecabcf632c 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -458,7 +458,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) { struct fuse_req *req; struct io_uring_cmd *cmd; - + ssize_t queue_refs; struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); @@ -486,15 +486,16 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) if (req) fuse_uring_stop_fuse_req_end(req); + + queue_refs = atomic_dec_return(&queue->ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); } static void fuse_uring_stop_list_entries(struct list_head *head, struct fuse_ring_queue *queue, enum fuse_ring_req_state exp_state) { - struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent, *next; - ssize_t queue_refs = SSIZE_MAX; LIST_HEAD(to_teardown); spin_lock(&queue->lock); @@ -511,11 +512,8 @@ static void fuse_uring_stop_list_entries(struct list_head *head, 
spin_unlock(&queue->lock); /* no queue lock to avoid lock order issues */ - list_for_each_entry_safe(ent, next, &to_teardown, list) { + list_for_each_entry_safe(ent, next, &to_teardown, list) fuse_uring_entry_teardown(ent); - queue_refs = atomic_dec_return(&ring->queue_refs); - WARN_ON_ONCE(queue_refs < 0); - } } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) From c915d7f04573fdd96d375eb3f2dba201e36136c9 Mon Sep 17 00:00:00 2001 From: Jian Huang Li Date: Mon, 20 Oct 2025 23:23:11 +0200 Subject: [PATCH 28/46] fs/fuse: fix potential memory leak from fuse_uring_cancel This issue could be observed sometimes during libfuse xfstests, where dmesg prints something like "kernel: WARNING: CPU: 4 PID: 0 at fs/fuse/dev_uring.c:204 fuse_uring_destruct+0x1f5/0x200 [fuse]". The cause is that the fuse daemon has just submitted FUSE_IO_URING_CMD_REGISTER SQEs when umount happens or the fuse daemon quits at this very early stage. After all uring queues have stopped, one or more unprocessed FUSE_IO_URING_CMD_REGISTER SQEs might still get processed; some new ring entities are then created and added to ent_avail_queue, and fuse_uring_cancel immediately moves them to ent_in_userspace after the SQEs get canceled. These ring entities were not moved to ent_released, and stayed in ent_in_userspace when fuse_uring_destruct was called. One way to solve it would be to also free 'ent_in_userspace' in fuse_uring_destruct(), but from the code point of view it is hard to see why that is needed. As suggested by Joanne, another solution is to avoid moving entries in fuse_uring_cancel() to the 'ent_in_userspace' list and just release them directly. 
Fixes: b6236c8407cb ("fuse: {io-uring} Prevent mount point hang on fuse-server termination") Cc: Joanne Koong Cc: # v6.14 Signed-off-by: Jian Huang Li Signed-off-by: Bernd Schubert (imported from commit 30d0473dcc0eecac6b1e00d9d87b0892146086a9) --- debian/scripts/misc/kconfig/__init__.py | 0 fs/fuse/dev_uring.c | 21 +++++++++------------ 2 files changed, 9 insertions(+), 12 deletions(-) delete mode 100644 debian/scripts/misc/kconfig/__init__.py diff --git a/debian/scripts/misc/kconfig/__init__.py b/debian/scripts/misc/kconfig/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index a6d3ecabcf632c..197333cf53042a 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -454,7 +454,7 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) /* * Release a request/entry on connection tear down */ -static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent, int issue_flags) { struct fuse_req *req; struct io_uring_cmd *cmd; @@ -482,7 +482,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); if (req) fuse_uring_stop_fuse_req_end(req); @@ -513,7 +513,7 @@ static void fuse_uring_stop_list_entries(struct list_head *head, /* no queue lock to avoid lock order issues */ list_for_each_entry_safe(ent, next, &to_teardown, list) - fuse_uring_entry_teardown(ent); + fuse_uring_entry_teardown(ent, IO_URING_F_UNLOCKED); } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) @@ -639,7 +639,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue; - bool need_cmd_done = false; + bool teardown = false; /* * direct access on ent - it must not be destructed as long 
as @@ -648,17 +648,14 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, queue = ent->queue; spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - need_cmd_done = true; - ent->cmd = NULL; + ent->state = FRRS_TEARDOWN; + list_del_init(&ent->list); + teardown = true; } spin_unlock(&queue->lock); - if (need_cmd_done) { - /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); - } + if (teardown) + fuse_uring_entry_teardown(ent, issue_flags); } static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, From 9b49db7dd146f7513c6f601b2e8558ee92ad66cb Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sun, 23 Nov 2025 17:43:40 +0100 Subject: [PATCH 29/46] fuse: Fix missing numa_q_map free in dev_uring This fixes a memory leak. (imported from commit f75b62fce0e6689b1cc57bdae4b6a93be1ca2168) --- fs/fuse/dev_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 197333cf53042a..be64945ce64584 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -178,9 +178,11 @@ static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) fuse_ring_destruct_q_map(&ring->q_map); - if (ring->numa_q_map) + if (ring->numa_q_map) { for (node = 0; node < ring->nr_numa_nodes; node++) fuse_ring_destruct_q_map(&ring->numa_q_map[node]); + kfree(ring->numa_q_map); + } } static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) From e32fd2522de2937b090904f80f78f4deefdb8721 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Fri, 19 Dec 2025 10:04:50 +0100 Subject: [PATCH 30/46] fuse: fix includes no functional changes Signed-off-by: Horst Birthelmer (imported from commit f0bccb2ea093d8bf703d535d34541b3000ec1d86) --- fs/fuse/compound.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/fuse/compound.c 
b/fs/fuse/compound.c index bc52e22eff3123..5d84e3558a06f8 100644 --- a/fs/fuse/compound.c +++ b/fs/fuse/compound.c @@ -10,18 +10,6 @@ #include "fuse_i.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - /* * Compound request builder and state tracker and args pointer storage */ From 1ae938aab8800cd7120a7fdf38d98ab81197cc06 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Tue, 30 Sep 2025 03:00:46 +0800 Subject: [PATCH 31/46] Create workflow the create pr for redfs in each branch Take actions on the PR merged event of this repo. Run copy-from-linux-branch.sh and create a PR for redfs. (cherry picked from commit f54872e99c6ebccc92c202e15c22eb68c26b10f6) (imported from commit 522fddfe975a361a411b853eb6b40c62e35ad39e) --- .github/workflows/create-redfs-pr.yml | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/create-redfs-pr.yml diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml new file mode 100644 index 00000000000000..cd7e9717440c4e --- /dev/null +++ b/.github/workflows/create-redfs-pr.yml @@ -0,0 +1,92 @@ +# Automatially run copy-from-linux-branch.sh on branches and create PR for redfs. +name: Sync to redfs repo +on: + # Triggers the workflow on pull request merged. + pull_request: + branches: [ "*" ] + types: [ "closed" ] + +jobs: + create-redfs-pr: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + # Checks-out to a different directory to avoid following checkout removing it. 
+ - uses: actions/checkout@v4 + with: + path: linux + + - name: Try to checkout sync-${{ github.ref_name }} if it exists + uses: actions/checkout@v4 + id: try-checkout + continue-on-error: true + with: + repository: DDNStorage/redfs + ref: sync-${{ github.ref_name }} + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Fallback to checkout main + if: steps.try-checkout.outcome == 'failure' + uses: actions/checkout@v4 + with: + repository: DDNStorage/redfs + ref: main + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Initialize git + run: | + git config --global user.name "DDNStorage RED Workflow" + git config --global user.email "red@ddn.com" + + - name: Create tracking branch based on main + if: steps.try-checkout.outcome == 'failure' + run: | + pushd redfs + git checkout -b sync-${{ github.ref_name }} + popd + + - name: Generate PR for redfs + run: | + declare -A MAP + MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.26.1.el9_6" + MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" + kerver=${MAP["${{ github.ref_name }}"]} + if [ -z ${kerver} ]; then + echo "Cannot find target kernel version" + exit 1 + fi + pushd redfs + ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} + git add src/$kerver + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} \n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg + echo -e "${{ github.sha }}" >> ../commit.msg + RET=0 + git commit -F ../commit.msg 2> ../commit.log || RET=$?; + if [ -s ../commit.log ]; then + echo "Error detcted in commit:" + cat ../commit.log + exit 1 + elif [ $RET -eq 0 ]; then + echo "Done. Push the code to remote:" + git push origin sync-${{ github.ref_name }} 2> ../push.log ||: + else + echo "No changes to existed codes. Still try with PR." 
+ fi + if [ -s ../push.log ]; then + echo "Error detected in push:" + cat ../push.log + fi + gh pr create --base main --fill || RET=$? + if [ $RET -eq 1 ]; then + echo "No pending changes for PR, returning $RET." + fi + popd + env: + GH_TOKEN: ${{ secrets.OPENUNIXPAT }} + From c4d803777e1472a25efd7d21282df1799f9a98f5 Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Tue, 30 Dec 2025 08:56:41 +0800 Subject: [PATCH 32/46] Fix the github actions PR trigger Switch to pull_request_target instead of pull_request as the github security requirement. Also limits the scope to protected PR. (cherry picked from commit b9980ad9af3598d465c72fb92f565415c8d4a006) (imported from commit e504e4a44abfa9cef941189e229cef0412c3f014) --- .github/workflows/create-redfs-pr.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml index cd7e9717440c4e..1f7b99a60c7aff 100644 --- a/.github/workflows/create-redfs-pr.yml +++ b/.github/workflows/create-redfs-pr.yml @@ -3,7 +3,10 @@ name: Sync to redfs repo on: # Triggers the workflow on pull request merged. 
pull_request: - branches: [ "*" ] + branches: [ "redfs-*" ] + types: [ "closed" ] + pull_request_target: + branches: [ "redfs-*" ] types: [ "closed" ] jobs: @@ -52,8 +55,9 @@ jobs: - name: Generate PR for redfs run: | declare -A MAP + MAP["redfs-rhel9_4-427.42.1"]="5.14.0-427.42.1.el9_4" MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" - MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.26.1.el9_6" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.12.1.el9_6" MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" kerver=${MAP["${{ github.ref_name }}"]} if [ -z ${kerver} ]; then @@ -63,7 +67,7 @@ jobs: pushd redfs ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} git add src/$kerver - echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} \n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }}\n" > ../commit.msg echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg echo -e "${{ github.sha }}" >> ../commit.msg RET=0 @@ -79,7 +83,7 @@ jobs: echo "No changes to existed codes. Still try with PR." fi if [ -s ../push.log ]; then - echo "Error detected in push:" + echo "Message detected in push:" cat ../push.log fi gh pr create --base main --fill || RET=$? @@ -88,5 +92,5 @@ jobs: fi popd env: - GH_TOKEN: ${{ secrets.OPENUNIXPAT }} + GH_TOKEN: ${{ secrets.REDFS_TOKEN }} From 27baabd0a9f09a1f2e27f0856ee3618f92920b68 Mon Sep 17 00:00:00 2001 From: Shuo Feng Date: Tue, 30 Dec 2025 11:18:10 +0800 Subject: [PATCH 33/46] Remove the pull_request_target from actions Remove the pull_request_target as it doesn't work. 
(cherry picked from commit 5328f660acf48ef3cf1f00ab8ae486aedf6874ee) (imported from commit 5277386783667357873cdd2819517b301a4b5063) --- .github/workflows/create-redfs-pr.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml index 1f7b99a60c7aff..cc03d7e1219e9b 100644 --- a/.github/workflows/create-redfs-pr.yml +++ b/.github/workflows/create-redfs-pr.yml @@ -5,9 +5,6 @@ on: pull_request: branches: [ "redfs-*" ] types: [ "closed" ] - pull_request_target: - branches: [ "redfs-*" ] - types: [ "closed" ] jobs: create-redfs-pr: From b2263460189dd85654051f2f506ebc65c75000f8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 13 Jan 2026 17:58:23 +0100 Subject: [PATCH 34/46] fuse: Make compounds a module option For now compounds are a module option and disabled by default Signed-off-by: Bernd Schubert (imported from commit f3b301ddccefec9e6363bb14e307c51462c0cc6a) --- fs/fuse/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 56faa9dd278f01..08c14478c38328 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -33,6 +33,10 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); +static bool __read_mostly enable_compound; +module_param(enable_compound, bool, 0644); +MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); + static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); @@ -1050,10 +1054,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->connected = 1; fc->dlm = 1; - /* pretend fuse server supports compound operations - * until it tells us otherwise. 
- */ - fc->compound_open_getattr = 1; + /* module option for now */ + fc->compound_open_getattr = enable_compound; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); From 026345bd6ca2172f3baecb48d989fbcf48962c65 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 4 Feb 2026 18:47:37 +0100 Subject: [PATCH 35/46] fuse: Fix the reduced queue assignment The use of bitmap_weight() didn't give the actual index, but always returned the current cpu, which resulted in a totally wrong mapping. It now just increases a counter for every mapping and ignores cores not in the given (numa) map and then find the index for that. Also added is a pr_debug(), which can be activated for example with echo "module redfs +p" >/proc/dynamic_debug/control (Pity that upstream is not open for such debug messages). (imported from commit bcbb684ad26c86cc77c04fdab1584ff1ed6bc270) --- fs/fuse/dev_uring.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index be64945ce64584..8254d2d8ff80ba 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -358,26 +358,25 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) } static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, - struct fuse_queue_map *q_map) + struct fuse_queue_map *q_map, + int node) { - int cpu, qid_idx; + int cpu, qid_idx, mapping_count = 0; size_t nr_queues; cpumask_set_cpu(qid, q_map->registered_q_mask); nr_queues = cpumask_weight(q_map->registered_q_mask); for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { - if (!q_map->cpu_to_qid) - return; - - /* - * Position of this CPU within the registered queue mask, - * handles non-contiguous CPU distributions across NUMA nodes. 
- */ - qid_idx = bitmap_weight( - cpumask_bits(q_map->registered_q_mask), cpu); + if (node != -1 && cpu_to_node(cpu) != node) + continue; - q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx % nr_queues, + qid_idx = mapping_count % nr_queues; + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx, q_map->registered_q_mask); + mapping_count++; + pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n", + __func__, node, qid, qid_idx, nr_queues, cpu, + q_map->cpu_to_qid[cpu]); } } @@ -428,7 +427,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, /* Static mapping from cpu to per numa queues */ node = cpu_to_node(qid); - fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node]); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node], node); /* * smp_store_release, as the variable is read without fc->lock and @@ -439,7 +438,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, ring->numa_q_map[node].nr_queues + 1); /* global mapping */ - fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map, -1); spin_unlock(&fc->lock); From 8e7e5b9ae6c8206dcd8acd06dfa02ee1815c440b Mon Sep 17 00:00:00 2001 From: Feng Shuo Date: Fri, 26 Dec 2025 23:34:06 +0800 Subject: [PATCH 36/46] Fix the compiling error on aarch64 Fix the include sequence which causes a compiling error on aarch64. (imported from commit f5fed0e3f4ad6f98427baa53f5e7505df831dd81) --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d67858330bd1f6..db3d2a737a7e0e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -6,8 +6,8 @@ See the file COPYING. 
*/ -#include "fuse_dlm_cache.h" #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include From ce5b16a788ee6b56668e07faa19de2b55d528499 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 11 Feb 2026 16:38:21 +0100 Subject: [PATCH 37/46] fuse: {io-uring} Prefer the current core over mapping Mapping might point to a totally different core due to random assignment. For performance using the current core might be beneficial Example (with core binding) unpatched WRITE: bw=841MiB/s patched WRITE: bw=1363MiB/s With fio --name=test --ioengine=psync --direct=1 \ --rw=write --bs=1M --iodepth=1 --numjobs=1 \ --filename_format=/redfs/testfile.\$jobnum --size=100G \ --thread --create_on_open=1 --runtime=30s --cpus_allowed=1 In order to get the good number `--cpus_allowed=1` is needed. This could be improved by a future change that avoids cpu migration in fuse_request_end() on wake_up() call. (imported from commit 32e0073d67cfc7bd602dc7675ae71fa825b04362) --- fs/fuse/dev_uring.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 8254d2d8ff80ba..9c8f7f38193796 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -22,8 +22,12 @@ MODULE_PARM_DESC(enable_uring, #define FUSE_RING_HEADER_PG 0 #define FUSE_RING_PAYLOAD_PG 1 +/* Threshold that determines if a better queue should be searched for */ #define FUSE_URING_Q_THRESHOLD 2 +/* Number of (re)tries to find a better queue */ +#define FUSE_URING_Q_TRIES 3 + bool fuse_uring_enabled(void) { @@ -1501,7 +1505,7 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, bool background) { unsigned int qid; - int node, retries = 0; + int node, tries = 0; unsigned int nr_queues; unsigned int cpu = task_cpu(current); struct fuse_ring_queue *queue, *primary_queue = NULL; @@ -1526,26 +1530,36 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, nr_queues = 
READ_ONCE(ring->numa_q_map[node].nr_queues); if (nr_queues) { + /* prefer the queue that corresponds to the current cpu */ + queue = READ_ONCE(ring->queues[cpu]); + if (queue) { + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + primary_queue = queue; + } + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) return NULL; - queue = READ_ONCE(ring->queues[qid]); + if (qid != cpu) { + queue = READ_ONCE(ring->queues[qid]); - /* Might happen on teardown */ - if (unlikely(!queue)) - return NULL; + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; - if (queue->nr_reqs < FUSE_URING_Q_THRESHOLD) - return queue; + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + } /* Retries help for load balancing */ - if (retries < FUSE_URING_Q_THRESHOLD) { - if (!retries) + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) { + if (!primary_queue) primary_queue = queue; - /* Increase cpu, assuming it will map to a differet qid*/ + /* Increase cpu, assuming it will map to a different qid*/ cpu++; - retries++; + tries++; goto retry; } } @@ -1556,9 +1570,10 @@ static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, /* global registered queue bitmap */ qid = ring->q_map.cpu_to_qid[cpu]; - if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) - /* Might happen on teardown */ + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) { + /* Might happen on teardown */ return NULL; + } return READ_ONCE(ring->queues[qid]); } From 2e4905aa111ff1b8b2f1e3fca08c4c383b21c10e Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Mon, 23 Feb 2026 16:58:19 +0100 Subject: [PATCH 38/46] fuse: enable large folios in inode initialization Add a module parameter to enable large folio support. 
Signed-off-by: Horst Birthelmer (imported from commit 475371c422ded852784924f998db8c181a077180) --- fs/fuse/file.c | 3 +++ fs/fuse/fuse_i.h | 2 ++ fs/fuse/inode.c | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5f331e6136931b..f24be6fb886461 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3400,4 +3400,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); + + if (enable_large_folios) + mapping_set_large_folios(inode->i_mapping); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f982875b6b0ed8..1e14a1102e3c01 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -84,6 +84,7 @@ extern struct mutex fuse_mutex; /** Module parameters */ extern unsigned int max_user_bgreq; extern unsigned int max_user_congthresh; +extern bool enable_large_folios; /* One forget request */ struct fuse_forget_link { @@ -948,6 +949,7 @@ struct fuse_conn { /* Use io_uring for communication */ unsigned int io_uring; + /* Does the filesystem support compound operations? 
*/ unsigned int compound_open_getattr:1; /** Maximum stack depth for passthrough backing files */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 08c14478c38328..b32ea012d37395 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -37,6 +37,10 @@ static bool __read_mostly enable_compound; module_param(enable_compound, bool, 0644); MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); +static bool __read_mostly enable_large_folios = true; +module_param(enable_large_folios, bool, 0644); +MODULE_PARM_DESC(enable_large_folios, "Enable large folios support"); + static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); From 0b7039affdfaaef576203857d231402501f5e6a0 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 6 Mar 2026 19:50:14 +0100 Subject: [PATCH 39/46] fuse: Remove double define of 'enable_large_folios' module param Compilation failed because the variable was declared extern in fuse_i.h but defined static in inode.c. The extern declaration is needed, as fuse_init_file_inode() in file.c reads the variable, so the static qualifier is dropped from the definition instead. (imported from commit b2af4bd09d3b91dd0b95db4513bc7e291eb26661) (imported from commit 5df77fbe17cbda47647cbba82400fe31042bc104) --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b32ea012d37395..24db2d8af82d3e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -37,7 +37,7 @@ static bool __read_mostly enable_compound; module_param(enable_compound, bool, 0644); MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); -static bool __read_mostly enable_large_folios = true; +bool __read_mostly enable_large_folios = true; module_param(enable_large_folios, bool, 0644); MODULE_PARM_DESC(enable_large_folios, "Enable large folios support"); From 9ebffb0be731668ecda47860df5622b153976e53 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 9 Mar 2026 21:39:09 +0100 Subject: [PATCH 40/46] fuse: Remove unlock_request/lock_request This is a DDN patch only, as unlock_request()/lock_request() solve a deadlock issue for 
specially designed file systems, see Documentation/filesystems/fuse.rst, in the section **Scenario 2 - Tricky deadlock** This one needs a carefully crafted filesystem. It's a variation on the above, only the call back to the filesystem is not explicit, but is caused by a pagefault. :: | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2 In redfsd we do our best to not cause any kind of user issues and just want to be as fast as possible. Hence, we do not need the per page unlock/lock checks. Given that fuse is a generic file system, this can be a DDN commit only for now, until we find a better generic solution. The unlock_request/lock_request functions have been replaced by check_req_aborted(), which is run once per copied argument. (imported from commit dc7fa1cd35a0cf0caaca30c35ef09714f5fd3646) --- fs/fuse/dev.c | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 703c56e5f63a3e..1c355999cf9b31 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -829,38 +829,14 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, return 0; } -/* - * Lock the request. Up to the next unlock_request() there mustn't be - * anything that could cause a page-fault. If the request was already - * aborted bail out. - */ -static int lock_request(struct fuse_req *req) -{ - int err = 0; - if (req) { - spin_lock(&req->waitq.lock); - if (test_bit(FR_ABORTED, &req->flags)) - err = -ENOENT; - else - set_bit(FR_LOCKED, &req->flags); - spin_unlock(&req->waitq.lock); - } - return err; -} -/* - * Unlock request. If it was aborted while locked, caller is responsible - * for unlocking and ending the request. 
- */ -static int unlock_request(struct fuse_req *req) +static int check_req_aborted(struct fuse_req *req) { int err = 0; - if (req) { + if (req && test_bit(FR_ABORTED, &req->flags)) { spin_lock(&req->waitq.lock); if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; - else - clear_bit(FR_LOCKED, &req->flags); spin_unlock(&req->waitq.lock); } return err; @@ -902,7 +878,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - err = unlock_request(cs->req); + err = check_req_aborted(cs->req); if (err) return err; @@ -961,7 +937,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pg = page; } - return lock_request(cs->req); + return 0; } /* Do as much copy to/from userspace buffer as we can */ @@ -1022,9 +998,6 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop struct pipe_buffer *buf = cs->pipebufs; folio_get(oldfolio); - err = unlock_request(cs->req); - if (err) - goto out_put_old; fuse_copy_finish(cs); @@ -1110,9 +1083,7 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->req); - if (!err) - err = 1; + err = 1; goto out_put_old; } @@ -1121,17 +1092,11 @@ static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, unsigned offset, unsigned count) { struct pipe_buffer *buf; - int err; if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; folio_get(folio); - err = unlock_request(cs->req); - if (err) { - folio_put(folio); - return err; - } fuse_copy_finish(cs); From 09e3c7b1ab98055f2e59162855a307a3303f6d5b Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Fri, 16 Jan 2026 13:31:07 +0100 Subject: [PATCH 41/46] fuse: fix inode initialization race Fix a race between fuse_iget() and fuse_reverse_inval_inode() where invalidation can arrive while an inode is being initialized, causing the invalidation to be lost. 
Add a waitqueue to make fuse_reverse_inval_inode() wait when it encounters an inode with attr_version == 0 (still initializing). When fuse_change_attributes_common() completes initialization, it wakes waiting threads. This ensures invalidations are properly serialized with inode initialization, maintaining cache coherency. Signed-off-by: Horst Birthelmer (imported from commit 03eacfdec557c7574be00442563297d26ac50521) --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1e14a1102e3c01..58dbd9dc7991ee 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -973,6 +973,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Waitqueue for attr_version initialization */ + wait_queue_head_t attr_version_waitq; + /** Version counter for evict inode */ atomic64_t evict_ctr; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 24db2d8af82d3e..e901446de52802 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -256,6 +256,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); fi->attr_version = atomic64_inc_return(&fc->attr_version); + wake_up_all(&fc->attr_version_waitq); fi->i_time = attr_valid; inode->i_ino = fuse_squash_ino(attr->ino); @@ -615,10 +616,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return -ENOENT; fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + while (fi->attr_version == 0) { + spin_unlock(&fi->lock); + wait_event(fc->attr_version_waitq, READ_ONCE(fi->attr_version) != 0); + spin_lock(&fi->lock); + } + fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); - + if (fc->inval_inode_entries) fuse_invalidate_inode_entry(inode); else if (fc->expire_inode_entries) @@ -1044,6 +1052,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, 
atomic_set(&fc->epoch, 1); INIT_WORK(&fc->epoch_work, fuse_epoch_work); init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->attr_version_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); From f65445e2a0f451bd78f1f3cfb83afc565019ac6f Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Thu, 26 Mar 2026 11:12:54 +0100 Subject: [PATCH 42/46] fuse: debug print requests when we hang in fuse_wait_aborted() Signed-off-by: Horst Birthelmer (imported from commit ad21e5a936b5a7ad4050ebb4d118ee96a5635104) --- fs/fuse/dev.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c355999cf9b31..90117fb13ec8e0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -585,7 +585,8 @@ static void request_wait_answer(struct fuse_req *req) * Either request is already in userspace, or it was forced. * Wait it out. */ - wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); + wait_event(req->waitq, + test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_req *req) @@ -2565,11 +2566,119 @@ void fuse_abort_conn(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_abort_conn); +static void fuse_debug_print_outstanding_reqs(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + pr_warn("FUSE: fuse_wait_aborted: num_waiting=%d (should be 0)\n", + atomic_read(&fc->num_waiting)); + +#ifdef CONFIG_FUSE_IO_URING + /* Print io_uring state if enabled */ + if (fc->ring) { + struct fuse_ring *ring = fc->ring; + + pr_warn("FUSE: io_uring enabled - queue_refs=%d ready=%d\n", + atomic_read(&ring->queue_refs), ring->ready); + } +#endif + + /* Print all outstanding requests - lockless for debug */ + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + struct fuse_req *req; + int i; + + /* Print all requests on fpq->io */ + if (!list_empty(&fpq->io)) { + pr_warn("FUSE: 
Outstanding requests on fpq->io:\n"); + list_for_each_entry(req, &fpq->io, list) { +#ifdef CONFIG_FUSE_IO_URING + if (test_bit(FR_URING, &req->flags) && + req->ring_entry) { + struct fuse_ring_ent *ent = req->ring_entry; + + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d ring_ent=%p state=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags), + ent, ent->state); + } else { +#endif + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags)); +#ifdef CONFIG_FUSE_IO_URING + } +#endif + } + } + + /* Print all requests on fpq->processing */ + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) { + if (list_empty(&fpq->processing[i])) + continue; + + pr_warn("FUSE: Outstanding requests on fpq->processing[%d]:\n", + i); + list_for_each_entry(req, &fpq->processing[i], list) { +#ifdef CONFIG_FUSE_IO_URING + if (test_bit(FR_URING, &req->flags) && + req->ring_entry) { + struct fuse_ring_ent *ent = req->ring_entry; + + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d ring_ent=%p state=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags), + ent, ent->state); + } else { +#endif + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d 
FR_URING=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags)); +#ifdef CONFIG_FUSE_IO_URING + } +#endif + } + } + } +} + void fuse_wait_aborted(struct fuse_conn *fc) { + unsigned int timeout = 20; + /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); - wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + +wait: + wait_event_timeout(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0, HZ * timeout); + + /* Debug: print info if we're waiting */ + if (atomic_read(&fc->num_waiting) > 0) { + fuse_debug_print_outstanding_reqs(fc); + timeout *= 3; + goto wait; + } fuse_uring_wait_stopped_queues(fc); } From 39d2998f1a951d4865cff10ca4d473ecf2811a20 Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Thu, 2 Apr 2026 08:14:42 +0200 Subject: [PATCH 43/46] fuse: fix io_uring connection abort leaving requests stuck Fix uninterruptible sleep (D state) hangs during FUSE filesystem teardown when using io_uring. The issue manifests as processes stuck waiting for requests that are never completed, particularly affecting force requests like FUSE_FLUSH or when requests are created after fuse_abort_conn() already finished. If on daemon exit io_uring_try_cancel_requests() runs and calls fuse_uring_cancel() which will teardown the entries by calling fuse_uring_entry_teardown() before fuse_abort_conn() then we end up in fuse_uring_abort with queue_refs == 0 and the queues are never stopped. If the queues are stopped all new requests will be rejected, but that does not happen, so all new calls are stuck. 
Signed-off-by: Horst Birthelmer (imported from commit 9550b4d733625ff51bbff4fda533dee9cf8ae765) --- fs/fuse/dev_uring.c | 3 +-- fs/fuse/dev_uring_i.h | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 9c8f7f38193796..7fa79e68afd5c9 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -143,11 +143,10 @@ void fuse_uring_flush_bg(struct fuse_conn *fc) if (!queue) continue; - queue->stopped = true; - WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); + queue->stopped = true; fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 86fef37a863a1e..4518990e98bdd5 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -179,10 +179,8 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) if (ring == NULL) return; - if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_flush_bg(fc); - fuse_uring_stop_queues(ring); - } + fuse_uring_flush_bg(fc); + fuse_uring_stop_queues(ring); } static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) From 17fd84583e4d035d2eaf0f59be2df908d6ee1694 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Fri, 6 Mar 2026 17:16:09 +0800 Subject: [PATCH 44/46] fuse: invalidate page cache after sync and async direct writes Fixes xfstests generic/451, similar to how commit b359af8275a9 ("fuse: Invalidate the page cache after FOPEN_DIRECT_IO write") fixes xfstests generic/209. 
Signed-off-by: Cheng Ding (imported from commit 51e07998077c2ad0ac83ed3f00435a3a50ded2bf) --- fs/fuse/file.c | 59 +++++++++++++++++++++++++++++++++++++++--------- fs/fuse/fuse_i.h | 1 + 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f24be6fb886461..170cf0b778189c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -24,6 +24,8 @@ #include #include +int sb_init_dio_done_wq(struct super_block *sb); + /* * Helper function to initialize fuse_args for OPEN/OPENDIR operations */ @@ -736,6 +738,19 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) return io->bytes < 0 ? io->size : io->bytes; } +static void fuse_aio_invalidate_worker(struct work_struct *work) +{ + struct fuse_io_priv *io = container_of(work, struct fuse_io_priv, work); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; + ssize_t res = fuse_get_res_by_io(io); + pgoff_t start = io->offset >> PAGE_SHIFT; + pgoff_t end = (io->offset + res - 1) >> PAGE_SHIFT; + + invalidate_inode_pages2_range(mapping, start, end); + io->iocb->ki_complete(io->iocb, res); + kref_put(&io->refcnt, fuse_io_release); +} + /* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. 
Otherwise, if bytes_requested @@ -768,10 +783,11 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&io->lock); if (!left && !io->blocking) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; ssize_t res = fuse_get_res_by_io(io); if (res >= 0) { - struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -780,6 +796,17 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } + if (io->write && res > 0 && mapping->nrpages) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + INIT_WORK(&io->work, fuse_aio_invalidate_worker); + queue_work(inode->i_sb->s_dio_done_wq, &io->work); + return; + } + io->iocb->ki_complete(io->iocb, res); } @@ -1859,15 +1886,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; - if (res > 0 && write && fopen_direct_io) { - /* - * As in generic_file_direct_write(), invalidate after the - * write, to invalidate read-ahead cache that may have competed - * with the write. - */ - invalidate_inode_pages2_range(mapping, idx_from, idx_to); - } - return res > 0 ? 
res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); @@ -1906,6 +1924,8 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct address_space *mapping = inode->i_mapping; + loff_t pos = iocb->ki_pos; ssize_t res; bool exclusive; @@ -1922,6 +1942,16 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); } + if (res > 0 && mapping->nrpages) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have + * competed with the write. + */ + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + res - 1) >> PAGE_SHIFT); + } } fuse_dio_unlock(iocb, exclusive); @@ -3012,6 +3042,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; + bool async = ff->fm->fc->async_dio; pos = offset; inode = file->f_mapping->host; @@ -3020,6 +3051,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; + if ((iov_iter_rw(iter) == WRITE) && async && !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + return ret; + } + io = kmalloc_obj(struct fuse_io_priv); if (!io) return -ENOMEM; @@ -3035,7 +3072,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. 
*/ - io->async = ff->fm->fc->async_dio; + io->async = async; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 58dbd9dc7991ee..d6b68d12be9307 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -393,6 +393,7 @@ union fuse_file_args { /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { struct kref refcnt; + struct work_struct work; int async; spinlock_t lock; unsigned reqs; From 4f790a1965d3903e7f8996ddbe5407191c11b5dd Mon Sep 17 00:00:00 2001 From: kchen Date: Tue, 21 Apr 2026 09:15:15 +0000 Subject: [PATCH 45/46] Invalidate selinux security label during inode invalidation. Add security_inode_invalidate_secctx() call to invalidate cached security context when inode attributes change. This ensures that SELinux security labels are properly refreshed and prevents stale security context from being used after inode modifications. Signed-off-by: Kevin Chen (imported from commit 6c9ec1dca008983d5eaecda066aed49edbd6cae7) --- fs/fuse/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e901446de52802..25fe7f81815b26 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -634,6 +634,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, fuse_invalidate_attr(inode); forget_all_cached_acls(inode); + security_inode_invalidate_secctx(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) From b3218850c73de39b4402e459cac34a8f75cdbbed Mon Sep 17 00:00:00 2001 From: Horst Birthelmer Date: Tue, 28 Apr 2026 20:14:06 +0200 Subject: [PATCH 46/46] ubuntu: fix makefile compiler warning option to support LLVM Signed-off-by: Horst Birthelmer --- ubuntu/igh-ecat/master/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/igh-ecat/master/Makefile b/ubuntu/igh-ecat/master/Makefile index 8aef742ebb2f74..4f8fc539c6ab6c 100644 --- a/ubuntu/igh-ecat/master/Makefile +++ b/ubuntu/igh-ecat/master/Makefile @@ 
-1,5 +1,5 @@ ccflags-y := -I$(src)/../ \ - -Wmaybe-uninitialized + -Wuninitialized obj-$(CONFIG_IGH_ECAT) += ec_master.o