diff --git a/.github/workflows/create-redfs-pr.yml b/.github/workflows/create-redfs-pr.yml new file mode 100644 index 00000000000000..cc03d7e1219e9b --- /dev/null +++ b/.github/workflows/create-redfs-pr.yml @@ -0,0 +1,93 @@ +# Automatially run copy-from-linux-branch.sh on branches and create PR for redfs. +name: Sync to redfs repo +on: + # Triggers the workflow on pull request merged. + pull_request: + branches: [ "redfs-*" ] + types: [ "closed" ] + +jobs: + create-redfs-pr: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + # Checks-out to a different directory to avoid following checkout removing it. + - uses: actions/checkout@v4 + with: + path: linux + + - name: Try to checkout sync-${{ github.ref_name }} if it exists + uses: actions/checkout@v4 + id: try-checkout + continue-on-error: true + with: + repository: DDNStorage/redfs + ref: sync-${{ github.ref_name }} + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Fallback to checkout main + if: steps.try-checkout.outcome == 'failure' + uses: actions/checkout@v4 + with: + repository: DDNStorage/redfs + ref: main + fetch-depth: 0 + path: redfs + token: ${{ secrets.REDFS_TOKEN }} + + - name: Initialize git + run: | + git config --global user.name "DDNStorage RED Workflow" + git config --global user.email "red@ddn.com" + + - name: Create tracking branch based on main + if: steps.try-checkout.outcome == 'failure' + run: | + pushd redfs + git checkout -b sync-${{ github.ref_name }} + popd + + - name: Generate PR for redfs + run: | + declare -A MAP + MAP["redfs-rhel9_4-427.42.1"]="5.14.0-427.42.1.el9_4" + MAP["redfs-rhel9_5-503.40.1"]="5.14.0-503.40.1.el9_5" + MAP["redfs-rhel9_6-570.12.1"]="5.14.0-570.12.1.el9_6" + MAP["redfs-ubuntu-noble-6.8.0-58.60"]="6.8.0-58.60.ubuntu" + kerver=${MAP["${{ github.ref_name }}"]} + if [ -z ${kerver} ]; then + echo "Cannot find target kernel version" + exit 1 + fi + pushd redfs + ./copy-from-linux-branch.sh $GITHUB_WORKSPACE/linux ${kerver} + git add src/$kerver + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }}\n" > ../commit.msg + echo -e "Sync with ${{ github.repository }} branch ${{ github.ref_name }} by commit" >> ../commit.msg + echo -e "${{ github.sha }}" >> ../commit.msg + RET=0 + git commit -F ../commit.msg 2> ../commit.log || RET=$?; + if [ -s ../commit.log ]; then + echo "Error detcted in commit:" + cat ../commit.log + exit 1 + elif [ $RET -eq 0 ]; then + echo "Done. Push the code to remote:" + git push origin sync-${{ github.ref_name }} 2> ../push.log ||: + else + echo "No changes to existed codes. Still try with PR." + fi + if [ -s ../push.log ]; then + echo "Message detected in push:" + cat ../push.log + fi + gh pr create --base main --fill || RET=$? + if [ $RET -eq 1 ]; then + echo "No pending changes for PR, returning $RET." + fi + popd + env: + GH_TOKEN: ${{ secrets.REDFS_TOKEN }} + diff --git a/debian/scripts/misc/kconfig/__init__.py b/debian/scripts/misc/kconfig/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 22ad9538dfc4b8..2407870803000d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := trace.o # put trace.o first so we see ftrace errors sooner -fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o compound.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o diff --git a/fs/fuse/compound.c b/fs/fuse/compound.c new file mode 100644 index 00000000000000..5d84e3558a06f8 --- /dev/null +++ b/fs/fuse/compound.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2025 + * + * This file implements compound operations for FUSE, allowing multiple + * operations to be batched into a single request to reduce round trips + * between kernel and userspace. + */ + +#include "fuse_i.h" + +/* + * Compound request builder and state tracker and args pointer storage + */ +struct fuse_compound_req { + struct fuse_mount *fm; + struct fuse_compound_in compound_header; + struct fuse_compound_out result_header; + + /* Per-operation error codes */ + int op_errors[FUSE_MAX_COMPOUND_OPS]; + struct fuse_args *op_args[FUSE_MAX_COMPOUND_OPS]; +}; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, u32 flags) +{ + struct fuse_compound_req *compound; + + compound = kzalloc(sizeof(*compound), GFP_KERNEL); + if (!compound) + return ERR_PTR(-ENOMEM); + + compound->fm = fm; + compound->compound_header.flags = flags; + + return compound; +} + +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args) +{ + if (!compound || + compound->compound_header.count >= FUSE_MAX_COMPOUND_OPS) + return -EINVAL; + + if (args->in_pages) + return -EINVAL; + + compound->op_args[compound->compound_header.count] = args; + compound->compound_header.count++; + return 0; +} + +static void *fuse_copy_response_per_req(struct fuse_args *args, + char *resp) +{ + int i; + size_t copied = 0; + + for (i = 0; i < args->out_numargs; i++) { + struct fuse_arg current_arg = args->out_args[i]; + size_t arg_size = current_arg.size; + + if (current_arg.value && arg_size > 0) { + memcpy(current_arg.value, + (char *)resp + copied, arg_size); + copied += arg_size; + } + } + + return (char *)resp + copied; +} + +int fuse_compound_get_error(struct fuse_compound_req *compound, int op_idx) +{ + return compound->op_errors[op_idx]; +} + +static void *fuse_compound_parse_one_op(struct fuse_compound_req *compound, + int op_index, void *op_out_data, + void *response_end) +{ + struct fuse_out_header *op_hdr = op_out_data; + struct fuse_args *args = compound->op_args[op_index]; + + if (op_hdr->len < sizeof(struct fuse_out_header)) + return NULL; + + /* Check if the entire operation response fits in the buffer */ + if ((char *)op_out_data + op_hdr->len > (char *)response_end) + return NULL; + + if (op_hdr->error != 0) + compound->op_errors[op_index] = op_hdr->error; + + if (args && op_hdr->len > sizeof(struct fuse_out_header)) + return fuse_copy_response_per_req(args, op_out_data + + sizeof(struct fuse_out_header)); + + /* No response data, just advance past the header */ + return (char *)op_out_data + op_hdr->len; +} + +static int fuse_compound_parse_resp(struct fuse_compound_req *compound, + u32 count, void *response, + size_t response_size) +{ + void *op_out_data = response; + void *response_end = (char *)response + response_size; + int i; + + if (!response || response_size < sizeof(struct fuse_out_header)) + return -EIO; + + for (i = 0; i < count && i < compound->result_header.count; i++) { + op_out_data = fuse_compound_parse_one_op(compound, i, + op_out_data, + response_end); + if (!op_out_data) + return -EIO; + } + + return 0; +} + +ssize_t fuse_compound_send(struct fuse_compound_req *compound) +{ + struct fuse_args args = { + .opcode = FUSE_COMPOUND, + .nodeid = 0, + .in_numargs = 2, + .out_numargs = 2, + .out_argvar = true, + }; + size_t resp_buffer_size; + size_t actual_response_size; + size_t buffer_pos; + size_t total_expected_out_size; + void *buffer = NULL; + void *resp_payload; + ssize_t ret; + int i; + + if (!compound) { + pr_info_ratelimited("FUSE: compound request is NULL in %s\n", + __func__); + return -EINVAL; + } + + if (compound->compound_header.count == 0) { + pr_info_ratelimited("FUSE: compound request contains no operations\n"); + return -EINVAL; + } + + buffer_pos = 0; + total_expected_out_size = 0; + + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + buffer_pos += needed_size; + + for (j = 0; j < op_args->out_numargs; j++) + total_expected_out_size += op_args->out_args[j].size; + } + + buffer = kvmalloc(buffer_pos, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + buffer_pos = 0; + for (i = 0; i < compound->compound_header.count; i++) { + struct fuse_args *op_args = compound->op_args[i]; + struct fuse_in_header *hdr; + size_t needed_size = sizeof(struct fuse_in_header); + int j; + + for (j = 0; j < op_args->in_numargs; j++) + needed_size += op_args->in_args[j].size; + + hdr = (struct fuse_in_header *)(buffer + buffer_pos); + memset(hdr, 0, sizeof(*hdr)); + hdr->len = needed_size; + hdr->opcode = op_args->opcode; + hdr->nodeid = op_args->nodeid; + hdr->uid = from_kuid(compound->fm->fc->user_ns, + current_fsuid()); + hdr->gid = from_kgid(compound->fm->fc->user_ns, + current_fsgid()); + hdr->pid = pid_nr_ns(task_pid(current), + compound->fm->fc->pid_ns); + buffer_pos += sizeof(*hdr); + + for (j = 0; j < op_args->in_numargs; j++) { + memcpy(buffer + buffer_pos, op_args->in_args[j].value, + op_args->in_args[j].size); + buffer_pos += op_args->in_args[j].size; + } + } + + resp_buffer_size = total_expected_out_size + + (compound->compound_header.count * + sizeof(struct fuse_out_header)); + + resp_payload = kvmalloc(resp_buffer_size, GFP_KERNEL | __GFP_ZERO); + if (!resp_payload) { + ret = -ENOMEM; + goto out_free_buffer; + } + + compound->compound_header.result_size = total_expected_out_size; + + args.in_args[0].size = sizeof(compound->compound_header); + args.in_args[0].value = &compound->compound_header; + args.in_args[1].size = buffer_pos; + args.in_args[1].value = buffer; + + args.out_args[0].size = sizeof(compound->result_header); + args.out_args[0].value = &compound->result_header; + args.out_args[1].size = resp_buffer_size; + args.out_args[1].value = resp_payload; + + ret = fuse_simple_request(compound->fm, &args); + if (ret < 0) + goto out; + + actual_response_size = args.out_args[1].size; + + if (actual_response_size < sizeof(struct fuse_compound_out)) { + pr_info_ratelimited("FUSE: compound response too small (%zu bytes, minimum %zu bytes)\n", + actual_response_size, + sizeof(struct fuse_compound_out)); + ret = -EINVAL; + goto out; + } + + ret = fuse_compound_parse_resp(compound, compound->result_header.count, + (char *)resp_payload, + actual_response_size); +out: + kvfree(resp_payload); +out_free_buffer: + kvfree(buffer); + return ret; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0b0241f47170d4..90117fb13ec8e0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include "fuse_trace.h" @@ -418,6 +418,9 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + + /* enqueue, as it is send to "fiq->ops queue" */ + trace_fuse_request_enqueue(req); fiq->ops->send_req(fiq, req); } @@ -582,7 +585,8 @@ static void request_wait_answer(struct fuse_req *req) * Either request is already in userspace, or it was forced. * Wait it out. */ - wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); + wait_event(req->waitq, + test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_req *req) @@ -660,6 +664,30 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } +ssize_t fuse_compound_request(struct fuse_mount *fm, struct fuse_args *args) +{ + struct fuse_req *req; + ssize_t ret; + + req = fuse_get_req(&invalid_mnt_idmap, fm, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_args_to_req(req, args); + + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + + __fuse_request_send(req); + ret = req->out.h.error; + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + ret = args->out_args[args->out_numargs - 1].size; + } + fuse_put_request(req); + return ret; +} + ssize_t __fuse_simple_request(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_args *args) @@ -732,6 +760,8 @@ static int fuse_request_queue_background(struct fuse_req *req) } __set_bit(FR_ISREPLY, &req->flags); + trace_fuse_request_bg_enqueue(req); + #ifdef CONFIG_FUSE_IO_URING if (fuse_uring_ready(fc)) return fuse_request_queue_background_uring(fc, req); @@ -800,38 +830,14 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, return 0; } -/* - * Lock the request. Up to the next unlock_request() there mustn't be - * anything that could cause a page-fault. If the request was already - * aborted bail out. - */ -static int lock_request(struct fuse_req *req) -{ - int err = 0; - if (req) { - spin_lock(&req->waitq.lock); - if (test_bit(FR_ABORTED, &req->flags)) - err = -ENOENT; - else - set_bit(FR_LOCKED, &req->flags); - spin_unlock(&req->waitq.lock); - } - return err; -} -/* - * Unlock request. If it was aborted while locked, caller is responsible - * for unlocking and ending the request. - */ -static int unlock_request(struct fuse_req *req) +static int check_req_aborted(struct fuse_req *req) { int err = 0; - if (req) { + if (req && test_bit(FR_ABORTED, &req->flags)) { spin_lock(&req->waitq.lock); if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; - else - clear_bit(FR_LOCKED, &req->flags); spin_unlock(&req->waitq.lock); } return err; @@ -873,7 +879,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - err = unlock_request(cs->req); + err = check_req_aborted(cs->req); if (err) return err; @@ -912,6 +918,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs++; } + } else if (cs->ring.pages) { + cs->pg = cs->ring.pages[cs->ring.page_idx++]; + /* + * non stricly needed, just to avoid a uring exception in + * fuse_copy_finish + */ + get_page(cs->pg); + cs->len = PAGE_SIZE; + cs->offset = 0; } else { size_t off; err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); @@ -923,7 +938,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pg = page; } - return lock_request(cs->req); + return 0; } /* Do as much copy to/from userspace buffer as we can */ @@ -984,9 +999,6 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop struct pipe_buffer *buf = cs->pipebufs; folio_get(oldfolio); - err = unlock_request(cs->req); - if (err) - goto out_put_old; fuse_copy_finish(cs); @@ -1072,9 +1084,7 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->req); - if (!err) - err = 1; + err = 1; goto out_put_old; } @@ -1083,17 +1093,11 @@ static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, unsigned offset, unsigned count) { struct pipe_buffer *buf; - int err; if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; folio_get(folio); - err = unlock_request(cs->req); - if (err) { - folio_put(folio); - return err; - } fuse_copy_finish(cs); @@ -1467,6 +1471,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); spin_unlock(&fiq->lock); + trace_fuse_request_send(req); args = req->args; reqsize = req->in.h.len; @@ -2430,6 +2435,45 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + fuse_uring_flush_bg(fc); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. + */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * @@ -2522,11 +2566,119 @@ void fuse_abort_conn(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_abort_conn); +static void fuse_debug_print_outstanding_reqs(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + pr_warn("FUSE: fuse_wait_aborted: num_waiting=%d (should be 0)\n", + atomic_read(&fc->num_waiting)); + +#ifdef CONFIG_FUSE_IO_URING + /* Print io_uring state if enabled */ + if (fc->ring) { + struct fuse_ring *ring = fc->ring; + + pr_warn("FUSE: io_uring enabled - queue_refs=%d ready=%d\n", + atomic_read(&ring->queue_refs), ring->ready); + } +#endif + + /* Print all outstanding requests - lockless for debug */ + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + struct fuse_req *req; + int i; + + /* Print all requests on fpq->io */ + if (!list_empty(&fpq->io)) { + pr_warn("FUSE: Outstanding requests on fpq->io:\n"); + list_for_each_entry(req, &fpq->io, list) { +#ifdef CONFIG_FUSE_IO_URING + if (test_bit(FR_URING, &req->flags) && + req->ring_entry) { + struct fuse_ring_ent *ent = req->ring_entry; + + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d ring_ent=%p state=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags), + ent, ent->state); + } else { +#endif + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags)); +#ifdef CONFIG_FUSE_IO_URING + } +#endif + } + } + + /* Print all requests on fpq->processing */ + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) { + if (list_empty(&fpq->processing[i])) + continue; + + pr_warn("FUSE: Outstanding requests on fpq->processing[%d]:\n", + i); + list_for_each_entry(req, &fpq->processing[i], list) { +#ifdef CONFIG_FUSE_IO_URING + if (test_bit(FR_URING, &req->flags) && + req->ring_entry) { + struct fuse_ring_ent *ent = req->ring_entry; + + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d ring_ent=%p state=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags), + ent, ent->state); + } else { +#endif + pr_warn(" req %p: opcode=%u unique=%llu flags=0x%lx FR_WAITING=%d FR_LOCKED=%d FR_FORCE=%d FR_ABORTED=%d FR_URING=%d\n", + req, req->in.h.opcode, + req->in.h.unique, req->flags, + test_bit(FR_WAITING, &req->flags), + test_bit(FR_LOCKED, &req->flags), + test_bit(FR_FORCE, &req->flags), + test_bit(FR_ABORTED, &req->flags), + test_bit(FR_URING, &req->flags)); +#ifdef CONFIG_FUSE_IO_URING + } +#endif + } + } + } +} + void fuse_wait_aborted(struct fuse_conn *fc) { + unsigned int timeout = 20; + /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); - wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + +wait: + wait_event_timeout(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0, HZ * timeout); + + /* Debug: print info if we're waiting */ + if (atomic_read(&fc->num_waiting) > 0) { + fuse_debug_print_outstanding_reqs(fc); + timeout *= 3; + goto wait; + } fuse_uring_wait_stopped_queues(fc); } diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 3a38b61aac26f7..7fa79e68afd5c9 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -11,6 +11,7 @@ #include #include +#include static bool __read_mostly enable_uring; module_param(enable_uring, bool, 0644); @@ -18,6 +19,14 @@ MODULE_PARM_DESC(enable_uring, "Enable userspace communication through io-uring"); #define FUSE_URING_IOV_SEGS 2 /* header and payload */ +#define FUSE_RING_HEADER_PG 0 +#define FUSE_RING_PAYLOAD_PG 1 + +/* Threshold that determines if a better queue should be searched for */ +#define FUSE_URING_Q_THRESHOLD 2 + +/* Number of (re)tries to find a better queue */ +#define FUSE_URING_Q_TRIES 3 bool fuse_uring_enabled(void) @@ -48,7 +57,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -86,14 +95,14 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + queue->nr_reqs--; list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } - spin_unlock(&queue->lock); if (error) @@ -113,32 +122,69 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) list_for_each_entry(req, &queue->fuse_req_queue, list) clear_bit(FR_PENDING, &req->flags); list_splice_init(&queue->fuse_req_queue, &req_list); + queue->nr_reqs = 0; spin_unlock(&queue->lock); /* must not hold queue lock to avoid order issues with fi->lock */ fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; + + if (!ring) + return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; - queue->stopped = true; - WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + queue->stopped = true; + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); + } +} + +/* + * Copy from memmap.c, should be exported + */ +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + + +static void fuse_ring_destruct_q_map(struct fuse_queue_map *q_map) +{ + free_cpumask_var(q_map->registered_q_mask); + kfree(q_map->cpu_to_qid); +} + +static void fuse_uring_destruct_q_masks(struct fuse_ring *ring) +{ + int node; + + fuse_ring_destruct_q_map(&ring->q_map); + + if (ring->numa_q_map) { + for (node = 0; node < ring->nr_numa_nodes; node++) + fuse_ring_destruct_q_map(&ring->numa_q_map[node]); + kfree(ring->numa_q_map); } } @@ -166,7 +212,7 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) if (!ring) return false; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; @@ -193,7 +239,7 @@ void fuse_uring_destruct(struct fuse_conn *fc) if (!ring) return; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; struct fuse_ring_ent *ent, *next; @@ -208,6 +254,9 @@ void fuse_uring_destruct(struct fuse_conn *fc) list_for_each_entry_safe(ent, next, &queue->ent_released, list) { list_del_init(&ent->list); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + io_pages_free(&ent->payload_pages, + ent->nr_payload_pages); kfree(ent); } @@ -216,11 +265,47 @@ void fuse_uring_destruct(struct fuse_conn *fc) ring->queues[qid] = NULL; } + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); fc->ring = NULL; } +static int fuse_uring_init_q_map(struct fuse_queue_map *q_map, size_t nr_cpu) +{ + if (!zalloc_cpumask_var(&q_map->registered_q_mask, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; + + q_map->cpu_to_qid = kcalloc(nr_cpu, sizeof(*q_map->cpu_to_qid), + GFP_KERNEL_ACCOUNT); + if (!q_map->cpu_to_qid) + return -ENOMEM; + + return 0; +} + +static int fuse_uring_create_q_masks(struct fuse_ring *ring, size_t nr_queues) +{ + int err, node; + + err = fuse_uring_init_q_map(&ring->q_map, nr_queues); + if (err) + return err; + + ring->numa_q_map = kcalloc(ring->nr_numa_nodes, + sizeof(*ring->numa_q_map), + GFP_KERNEL_ACCOUNT); + if (!ring->numa_q_map) + return -ENOMEM; + for (node = 0; node < ring->nr_numa_nodes; node++) { + err = fuse_uring_init_q_map(&ring->numa_q_map[node], + nr_queues); + if (err) + return err; + } + return 0; +} + /* * Basic ring setup for this connection based on the provided configuration */ @@ -230,19 +315,26 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) size_t nr_queues = num_possible_cpus(); struct fuse_ring *res = NULL; size_t max_payload_size; + int err; ring = kzalloc_obj(*fc->ring, GFP_KERNEL_ACCOUNT); if (!ring) return NULL; - ring->queues = kzalloc_objs(struct fuse_ring_queue *, nr_queues, - GFP_KERNEL_ACCOUNT); + ring->nr_numa_nodes = num_online_nodes(); + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); if (!ring->queues) goto out_err; max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + err = fuse_uring_create_q_masks(ring, nr_queues); + if (err) + goto out_err; + spin_lock(&fc->lock); if (fc->ring) { /* race, another thread created the ring in the meantime */ @@ -253,7 +345,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - ring->nr_queues = nr_queues; + ring->max_nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; smp_store_release(&fc->ring, ring); @@ -262,17 +354,42 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) return ring; out_err: + fuse_uring_destruct_q_masks(ring); kfree(ring->queues); kfree(ring); return res; } +static void fuse_uring_cpu_qid_mapping(struct fuse_ring *ring, int qid, + struct fuse_queue_map *q_map, + int node) +{ + int cpu, qid_idx, mapping_count = 0; + size_t nr_queues; + + cpumask_set_cpu(qid, q_map->registered_q_mask); + nr_queues = cpumask_weight(q_map->registered_q_mask); + for (cpu = 0; cpu < ring->max_nr_queues; cpu++) { + if (node != -1 && cpu_to_node(cpu) != node) + continue; + + qid_idx = mapping_count % nr_queues; + q_map->cpu_to_qid[cpu] = cpumask_nth(qid_idx, + q_map->registered_q_mask); + mapping_count++; + pr_debug("%s node=%d qid=%d qid_idx=%d nr_queues=%zu %d->%d\n", + __func__, node, qid, qid_idx, nr_queues, cpu, + q_map->cpu_to_qid[cpu]); + } +} + static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, int qid) { struct fuse_conn *fc = ring->fc; struct fuse_ring_queue *queue; struct list_head *pq; + int node; queue = kzalloc_obj(*queue, GFP_KERNEL_ACCOUNT); if (!queue) @@ -310,6 +427,22 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, * write_once and lock as the caller mostly doesn't take the lock at all */ WRITE_ONCE(ring->queues[qid], queue); + + /* Static mapping from cpu to per numa queues */ + node = cpu_to_node(qid); + fuse_uring_cpu_qid_mapping(ring, qid, &ring->numa_q_map[node], node); + + /* + * smp_store_release, as the variable is read without fc->lock and + * we need to avoid compiler re-ordering of updating the nr_queues + * and setting ring->numa_queues[node].cpu_to_qid above + */ + smp_store_release (&ring->numa_q_map[node].nr_queues, + ring->numa_q_map[node].nr_queues + 1); + + /* global mapping */ + fuse_uring_cpu_qid_mapping(ring, qid, &ring->q_map, -1); + spin_unlock(&fc->lock); return queue; @@ -325,11 +458,11 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) /* * Release a request/entry on connection tear down */ -static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent, int issue_flags) { struct fuse_req *req; struct io_uring_cmd *cmd; - + ssize_t queue_refs; struct fuse_ring_queue *queue = ent->queue; spin_lock(&queue->lock); @@ -353,19 +486,20 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); if (req) fuse_uring_stop_fuse_req_end(req); + + queue_refs = atomic_dec_return(&queue->ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); } static void fuse_uring_stop_list_entries(struct list_head *head, struct fuse_ring_queue *queue, enum fuse_ring_req_state exp_state) { - struct fuse_ring *ring = queue->ring; struct fuse_ring_ent *ent, *next; - ssize_t queue_refs = SSIZE_MAX; LIST_HEAD(to_teardown); spin_lock(&queue->lock); @@ -382,11 +516,8 @@ static void fuse_uring_stop_list_entries(struct list_head *head, spin_unlock(&queue->lock); /* no queue lock to avoid lock order issues */ - list_for_each_entry_safe(ent, next, &to_teardown, list) { - fuse_uring_entry_teardown(ent); - queue_refs = atomic_dec_return(&ring->queue_refs); - WARN_ON_ONCE(queue_refs < 0); - } + list_for_each_entry_safe(ent, next, &to_teardown, list) + fuse_uring_entry_teardown(ent, IO_URING_F_UNLOCKED); } static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) @@ -405,7 +536,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) int qid; struct fuse_ring_ent *ent; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = ring->queues[qid]; if (!queue) @@ -424,6 +555,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring) pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", ring, qid, ent, ent->state); } + spin_unlock(&queue->lock); } ring->stop_debug_log = 1; @@ -436,7 +568,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) container_of(work, struct fuse_ring, async_teardown_work.work); /* XXX code dup */ - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) @@ -470,16 +602,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work) void fuse_uring_stop_queues(struct fuse_ring *ring) { int qid; + int node; - for (qid = 0; qid < ring->nr_queues; qid++) { + for (qid = 0; qid < ring->max_nr_queues; qid++) { struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); fuse_uring_teardown_entries(queue); } + /* Reset all queue masks, we won't process any more IO */ + cpumask_clear(ring->q_map.registered_q_mask); + for (node = 0; node < ring->nr_numa_nodes; node++) { + if (ring->numa_q_map) + cpumask_clear(ring->numa_q_map[node].registered_q_mask); + } + if (atomic_read(&ring->queue_refs) > 0) { ring->teardown_time = jiffies; INIT_DELAYED_WORK(&ring->async_teardown_work, @@ -502,7 +643,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, { struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue; - bool need_cmd_done = false; + bool teardown = false; /* * direct access on ent - it must not be destructed as long as @@ -511,17 +652,14 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, queue = ent->queue; spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { - ent->state = FRRS_USERSPACE; - list_move_tail(&ent->list, &queue->ent_in_userspace); - need_cmd_done = true; - ent->cmd = NULL; + ent->state = FRRS_TEARDOWN; + list_del_init(&ent->list); + teardown = true; } spin_unlock(&queue->lock); - if (need_cmd_done) { - /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); - } + if (teardown) + fuse_uring_entry_teardown(ent, issue_flags); } static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, @@ -598,12 +736,69 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, fuse_copy_init(&cs, false, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); fuse_copy_finish(&cs); return err; } +/* + * Copy data from the req to the ring buffer + * In order to be able to write into the ring buffer from the application, + * i.e. to avoid io_uring_cmd_complete_in_task(), the header needs to be + * pinned as well. + */ +static int fuse_uring_args_to_ring_pages(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent, + struct fuse_uring_req_header *headers) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + fuse_copy_init(&cs, 1, NULL); + cs.is_uring = 1; + cs.req = req; + cs.ring.pages = ent->payload_pages; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + memcpy(&headers->op_in, in_args->value, in_args->size); + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + goto copy_finish; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + memcpy(&headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); + +copy_finish: + fuse_copy_finish(&cs); + return err; +} + /* * Copy data from the req to the ring buffer */ @@ -630,6 +825,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_init(&cs, true, &iter); cs.is_uring = true; cs.req = req; + if (ent->payload_pages) + cs.ring.pages = ent->payload_pages; if (num_args > 0) { /* @@ -655,12 +852,14 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, fuse_copy_finish(&cs); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); - return err; + goto copy_finish; } ent_in_out.payload_sz = cs.ring.copied_sz; err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, sizeof(ent_in_out)); +copy_finish: + fuse_copy_finish(&cs); return err ? -EFAULT : 0; } @@ -670,6 +869,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, struct fuse_ring_queue *queue = ent->queue; struct fuse_ring *ring = queue->ring; int err; + struct fuse_uring_req_header *headers = NULL; err = -EIO; if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { @@ -682,22 +882,29 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, if (WARN_ON(req->in.h.unique == 0)) return err; - /* copy the request */ - err = fuse_uring_args_to_ring(ring, req, ent); - if (unlikely(err)) { - pr_info_ratelimited("Copy to ring failed: %d\n", err); - return err; - } - /* copy fuse_in_header */ - err = copy_to_user(&ent->headers->in_out, &req->in.h, - sizeof(req->in.h)); - if (err) { - err = -EFAULT; - return err; + if (ent->header_pages) { + headers = kmap_local_page( + ent->header_pages[FUSE_RING_HEADER_PG]); + + memcpy(&headers->in_out, &req->in.h, sizeof(req->in.h)); + + err = fuse_uring_args_to_ring_pages(ring, req, ent, headers); + kunmap_local(headers); + } else { + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) + err = -EFAULT; } - return 0; + return err; } static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, @@ -894,7 +1101,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, if (!ring) return err; - if (qid >= ring->nr_queues) + if (qid >= ring->max_nr_queues) return -EINVAL; queue = ring->queues[qid]; @@ -951,59 +1158,43 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, return 0; } -static bool is_ring_ready(struct fuse_ring *ring, int current_qid) -{ - int qid; - struct fuse_ring_queue *queue; - bool ready = true; - - for (qid = 0; qid < ring->nr_queues && ready; qid++) { - if (current_qid == qid) - continue; - - queue = ring->queues[qid]; - if (!queue) { - ready = false; - break; - } - - spin_lock(&queue->lock); - if (list_empty(&queue->ent_avail_queue)) - ready = false; - spin_unlock(&queue->lock); - } - - return ready; -} - /* - * fuse_uring_req_fetch command handling + * Copy from memmap.c, should be exported there */ -static void fuse_uring_do_register(struct fuse_ring_ent *ent, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +static struct page **io_pin_pages(unsigned long uaddr, unsigned long len, + int *npages) { - struct fuse_ring_queue *queue = ent->queue; - struct fuse_ring *ring = queue->ring; - struct fuse_conn *fc = ring->fc; - struct fuse_iqueue *fiq = &fc->iq; - - fuse_uring_prepare_cancel(cmd, issue_flags, ent); - - spin_lock(&queue->lock); - ent->cmd = cmd; - fuse_uring_ent_avail(ent, queue); - spin_unlock(&queue->lock); - - if (!ring->ready) { - bool ready = is_ring_ready(ring, queue->qid); + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } - if (ready) { - WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); - WRITE_ONCE(ring->ready, true); - wake_up_all(&fc->blocked_waitq); - } + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; } + kvfree(pages); + return ERR_PTR(ret); } /* @@ -1032,6 +1223,59 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, return 0; } +static int fuse_uring_pin_pages(struct fuse_ring_ent *ent) +{ + struct fuse_ring *ring = ent->queue->ring; + int err; + + /* + * This needs to do locked memory accounting, for now privileged servers + * only. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* Pin header pages */ + if (!PAGE_ALIGNED(ent->headers)) { + pr_info_ratelimited("ent->headers is not page-aligned: %p\n", + ent->headers); + return -EINVAL; + } + + ent->header_pages = io_pin_pages((unsigned long)ent->headers, + sizeof(struct fuse_uring_req_header), + &ent->nr_header_pages); + if (IS_ERR(ent->header_pages)) { + err = PTR_ERR(ent->header_pages); + pr_info_ratelimited("Failed to pin header pages, err=%d\n", + err); + ent->header_pages = NULL; + return err; + } + + if (ent->nr_header_pages != 1) { + pr_info_ratelimited("Header pages not pinned as one page\n"); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->header_pages = NULL; + return -EINVAL; + } + + /* Pin payload pages */ + ent->payload_pages = io_pin_pages((unsigned long)ent->payload, + ring->max_payload_sz, + &ent->nr_payload_pages); + if (IS_ERR(ent->payload_pages)) { + err = PTR_ERR(ent->payload_pages); + pr_info_ratelimited("Failed to pin payload pages, err=%d\n", + err); + io_pages_free(&ent->header_pages, ent->nr_header_pages); + ent->payload_pages = NULL; + return err; + } + + return 0; +} + static struct fuse_ring_ent * fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, struct fuse_ring_queue *queue) @@ -1073,6 +1317,12 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, ent->headers = iov[0].iov_base; ent->payload = iov[1].iov_base; + err = fuse_uring_pin_pages(ent); + if (err) { + kfree(ent); + return ERR_PTR(err); + } + atomic_inc(&ring->queue_refs); return ent; } @@ -1089,6 +1339,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; + struct fuse_iqueue *fiq = &fc->iq; int err; unsigned int qid = READ_ONCE(cmd_req->qid); @@ -1099,7 +1350,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, return err; } - if (qid >= ring->nr_queues) { + if (qid >= ring->max_nr_queues) { pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); return -EINVAL; } @@ -1120,7 +1371,19 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, if (IS_ERR(ent)) return PTR_ERR(ent); - fuse_uring_do_register(ent, cmd, issue_flags); + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + if (!ring->ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + + spin_lock(&queue->lock); + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* Marks the ring entry as ready */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); return 0; } @@ -1207,6 +1470,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); + trace_fuse_request_send(ent->fuse_req); io_uring_cmd_done(cmd, ret, issue_flags); } @@ -1236,30 +1500,107 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) fuse_uring_send(ent, cmd, err, issue_flags); } -static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +static struct fuse_ring_queue *fuse_uring_select_queue(struct fuse_ring *ring, + bool background) { unsigned int qid; - struct fuse_ring_queue *queue; + int node, tries = 0; + unsigned int nr_queues; + unsigned int cpu = task_cpu(current); + struct fuse_ring_queue *queue, *primary_queue = NULL; - qid = task_cpu(current); + /* + * Background requests result in better performance on a different + * CPU, unless CPUs are already busy. + */ + if (background) + cpu++; - if (WARN_ONCE(qid >= ring->nr_queues, - "Core number (%u) exceeds nr queues (%zu)\n", qid, - ring->nr_queues)) - qid = 0; +retry: + cpu = cpu % ring->max_nr_queues; + + /* numa local registered queue bitmap */ + node = cpu_to_node(cpu); + if (WARN_ONCE(node >= ring->nr_numa_nodes, + "Node number (%d) exceeds nr nodes (%d)\n", + node, ring->nr_numa_nodes)) { + node = 0; + } - queue = ring->queues[qid]; - WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + nr_queues = READ_ONCE(ring->numa_q_map[node].nr_queues); + if (nr_queues) { + /* prefer the queue that corresponds to the current cpu */ + queue = READ_ONCE(ring->queues[cpu]); + if (queue) { + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + primary_queue = queue; + } - return queue; + qid = ring->numa_q_map[node].cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) + return NULL; + if (qid != cpu) { + queue = READ_ONCE(ring->queues[qid]); + + /* Might happen on teardown */ + if (unlikely(!queue)) + return NULL; + + if (queue->nr_reqs <= FUSE_URING_Q_THRESHOLD) + return queue; + } + + /* Retries help for load balancing */ + if (tries < FUSE_URING_Q_TRIES && tries + 1 < nr_queues) { + if (!primary_queue) + primary_queue = queue; + + /* Increase cpu, assuming it will map to a different qid*/ + cpu++; + tries++; + goto retry; + } + } + + /* Retries exceeded, take the primary target queue */ + if (primary_queue) + return primary_queue; + + /* global registered queue bitmap */ + qid = ring->q_map.cpu_to_qid[cpu]; + if (WARN_ON_ONCE(qid >= ring->max_nr_queues)) { + /* Might happen on teardown */ + return NULL; + } + return READ_ONCE(ring->queues[qid]); } -static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg) { struct io_uring_cmd *cmd = ent->cmd; - uring_cmd_set_ring_ent(cmd, ent); - io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + /* + * Task needed when pages are not pinned as the application doing IO + * is not allowed to write into fuse-server pages. + * Additionally for IO through io-uring as issue flags are unknown then. + * backgrounds requests might hold spin-locks, that conflict with + * io_uring_cmd_done() mutex lock. + */ + if (!ent->header_pages || current->io_uring || bg) { + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); + } else { + int err = fuse_uring_prepare_send(ent, ent->fuse_req); + struct fuse_ring_queue *queue = ent->queue; + + if (err) { + fuse_uring_next_fuse_req(ent, queue, + IO_URING_F_UNLOCKED); + return; + } + fuse_uring_send(ent, cmd, 0, IO_URING_F_UNLOCKED); + } } /* queue a fuse request and send it if a ring entry is available */ @@ -1272,7 +1613,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) int err; err = -EINVAL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring, false); if (!queue) goto err; @@ -1287,14 +1628,17 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); + queue->nr_reqs++; + if (ent) fuse_uring_add_req_to_ring_ent(ent, req); else list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); if (ent) - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, false); return; @@ -1313,7 +1657,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) struct fuse_ring_queue *queue; struct fuse_ring_ent *ent = NULL; - queue = fuse_uring_task_to_queue(ring); + queue = fuse_uring_select_queue(ring, true); if (!queue) return false; @@ -1326,6 +1670,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) set_bit(FR_URING, &req->flags); req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); + queue->nr_reqs++; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); @@ -1333,7 +1678,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); /* @@ -1347,7 +1692,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) fuse_uring_add_req_to_ring_ent(ent, req); spin_unlock(&queue->lock); - fuse_uring_dispatch_ent(ent); + fuse_uring_dispatch_ent(ent, true); } else { spin_unlock(&queue->lock); } @@ -1358,8 +1703,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) bool fuse_uring_remove_pending_req(struct fuse_req *req) { struct fuse_ring_queue *queue = req->ring_queue; + bool removed = fuse_remove_pending_req(req, &queue->lock); + + if (removed) { + /* Update counters after successful removal */ + spin_lock(&queue->lock); + queue->nr_reqs--; + spin_unlock(&queue->lock); + } - return fuse_remove_pending_req(req, &queue->lock); + return removed; } static const struct fuse_iqueue_ops fuse_io_uring_ops = { diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 51a563922ce141..4518990e98bdd5 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -40,7 +40,11 @@ enum fuse_ring_req_state { struct fuse_ring_ent { /* userspace buffer */ struct fuse_uring_req_header __user *headers; + struct page **header_pages; + int nr_header_pages; void __user *payload; + struct page **payload_pages; + int nr_payload_pages; /* the ring queue that owns the request */ struct fuse_ring_queue *queue; @@ -94,6 +98,9 @@ struct fuse_ring_queue { /* background fuse requests */ struct list_head fuse_req_bg_queue; + /* number of requests queued or in userspace */ + unsigned int nr_reqs; + struct fuse_pqueue fpq; unsigned int active_background; @@ -101,6 +108,17 @@ struct fuse_ring_queue { bool stopped; }; +struct fuse_queue_map { + /* Tracks which queues are registered */ + cpumask_var_t registered_q_mask; + + /* number of registered queues */ + size_t nr_queues; + + /* cpu to qid mapping */ + int *cpu_to_qid; +}; + /** * Describes if uring is for communication and holds alls the data needed * for uring communication @@ -110,7 +128,10 @@ struct fuse_ring { struct fuse_conn *fc; /* number of ring queues */ - size_t nr_queues; + size_t max_nr_queues; + + /* number of numa nodes */ + int nr_numa_nodes; /* maximum payload/arg size */ size_t max_payload_sz; @@ -122,6 +143,12 @@ struct fuse_ring { */ unsigned int stop_debug_log : 1; + /* per numa node queue tracking */ + struct fuse_queue_map *numa_q_map; + + /* all queue tracking */ + struct fuse_queue_map q_map; + wait_queue_head_t stop_waitq; /* async tear down */ @@ -138,7 +165,7 @@ struct fuse_ring { bool fuse_uring_enabled(void); void fuse_uring_destruct(struct fuse_conn *fc); void fuse_uring_stop_queues(struct fuse_ring *ring); -void fuse_uring_abort_end_requests(struct fuse_ring *ring); +void fuse_uring_flush_bg(struct fuse_conn *fc); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); @@ -152,10 +179,8 @@ static inline void fuse_uring_abort(struct fuse_conn *fc) if (ring == NULL) return; - if (atomic_read(&ring->queue_refs) > 0) { - fuse_uring_abort_end_requests(ring); - fuse_uring_stop_queues(ring); - } + fuse_uring_flush_bg(fc); + fuse_uring_stop_queues(ring); } static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) @@ -206,6 +231,14 @@ static inline bool fuse_uring_request_expired(struct fuse_conn *fc) return false; } +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_flush_bg(struct fuse_conn *fc) +{ +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7ac6b232ef1232..db3d2a737a7e0e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -1492,14 +1493,7 @@ static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.opcode = FUSE_GETATTR; - args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + fuse_getattr_args_fill(&args, get_node_id(inode), &inarg, &outarg); err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || @@ -2181,6 +2175,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * truncation has already been done by OPEN. But still * need to truncate page cache. */ + if (fc->dlm && fc->writeback_cache) + fuse_dlm_cache_release_locks(fi); i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; @@ -2286,6 +2282,9 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (fc->dlm && fc->writeback_cache) + fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 676fd9856bfbf3..170cf0b778189c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include #include @@ -23,6 +24,41 @@ #include #include +int sb_init_dio_done_wq(struct super_block *sb); + +/* + * Helper function to initialize fuse_args for OPEN/OPENDIR operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg) +{ + args->opcode = opcode; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + +/* + * Helper function to initialize fuse_args for GETATTR operations + */ +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg) +{ + args->opcode = FUSE_GETATTR; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; +} + static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, struct fuse_open_out *outargp) @@ -40,14 +76,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = opcode; - args.nodeid = nodeid; - args.in_numargs = 1; - args.in_args[0].size = sizeof(inarg); - args.in_args[0].value = &inarg; - args.out_numargs = 1; - args.out_args[0].size = sizeof(*outargp); - args.out_args[0].value = outargp; + fuse_open_args_fill(&args, nodeid, opcode, &inarg, outargp); return fuse_simple_request(fm, &args); } @@ -126,8 +155,66 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) } } +static int fuse_compound_open_getattr(struct fuse_mount *fm, u64 nodeid, + int flags, int opcode, + struct fuse_file *ff, + struct fuse_attr_out *outattrp, + struct fuse_open_out *outopenp) +{ + struct fuse_compound_req *compound; + struct fuse_args open_args = {}; + struct fuse_args getattr_args = {}; + struct fuse_open_in open_in = {}; + struct fuse_getattr_in getattr_in = {}; + int err; + + compound = fuse_compound_alloc(fm, 0); + if (IS_ERR(compound)) + return PTR_ERR(compound); + + open_in.flags = flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) + open_in.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (open_in.flags & O_TRUNC) && !capable(CAP_FSETID)) + open_in.open_flags |= FUSE_OPEN_KILL_SUIDGID; + + fuse_open_args_fill(&open_args, nodeid, opcode, &open_in, outopenp); + + err = fuse_compound_add(compound, &open_args); + if (err) + goto out; + + fuse_getattr_args_fill(&getattr_args, nodeid, &getattr_in, outattrp); + + err = fuse_compound_add(compound, &getattr_args); + if (err) + goto out; + + err = fuse_compound_send(compound); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 0); + if (err) + goto out; + + err = fuse_compound_get_error(compound, 1); + if (err) + goto out; + + ff->fh = outopenp->fh; + ff->open_flags = outopenp->open_flags; + +out: + kfree(compound); + return err; +} + struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir) + struct inode *inode, + unsigned int open_flags, bool isdir) { struct fuse_conn *fc = fm->fc; struct fuse_file *ff; @@ -153,23 +240,46 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (open) { /* Store outarg for fuse_finish_open() */ struct fuse_open_out *outargp = &ff->args->open_outarg; - int err; + int err = -ENOSYS; + + if (inode && fc->compound_open_getattr) { + + struct fuse_attr_out attr_outarg; + + err = fuse_compound_open_getattr(fm, nodeid, open_flags, + opcode, ff, + &attr_outarg, outargp); + if (err == -ENOSYS) + fc->compound_open_getattr = 0; + if (!err) + fuse_change_attributes(inode, &attr_outarg.attr, + NULL, + ATTR_TIMEOUT(&attr_outarg), + fuse_get_attr_version(fc)); + } + if (err == -ENOSYS) { + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); + if (!err) { + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; + } + } - err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); - if (!err) { - ff->fh = outargp->fh; - ff->open_flags = outargp->open_flags; - } else if (err != -ENOSYS) { - fuse_file_free(ff); - return ERR_PTR(err); - } else { - if (isdir) { + if (err) { + if (err != -ENOSYS) { + /* err is not ENOSYS */ + fuse_file_free(ff); + return ERR_PTR(err); + } else { /* No release needed */ kfree(ff->args); ff->args = NULL; - fc->no_opendir = 1; - } else { - fc->no_open = 1; + + /* we don't have open */ + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } } @@ -185,11 +295,10 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { - struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + struct fuse_file *ff = fuse_file_open(fm, nodeid, file_inode(file), file->f_flags, isdir); if (!IS_ERR(ff)) file->private_data = ff; - return PTR_ERR_OR_ZERO(ff); } EXPORT_SYMBOL_GPL(fuse_do_open); @@ -629,6 +738,19 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) return io->bytes < 0 ? io->size : io->bytes; } +static void fuse_aio_invalidate_worker(struct work_struct *work) +{ + struct fuse_io_priv *io = container_of(work, struct fuse_io_priv, work); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; + ssize_t res = fuse_get_res_by_io(io); + pgoff_t start = io->offset >> PAGE_SHIFT; + pgoff_t end = (io->offset + res - 1) >> PAGE_SHIFT; + + invalidate_inode_pages2_range(mapping, start, end); + io->iocb->ki_complete(io->iocb, res); + kref_put(&io->refcnt, fuse_io_release); +} + /* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested @@ -661,10 +783,11 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&io->lock); if (!left && !io->blocking) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct address_space *mapping = io->iocb->ki_filp->f_mapping; ssize_t res = fuse_get_res_by_io(io); if (res >= 0) { - struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -673,6 +796,17 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } + if (io->write && res > 0 && mapping->nrpages) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + INIT_WORK(&io->work, fuse_aio_invalidate_worker); + queue_work(inode->i_sb->s_dio_done_wq, &io->work); + return; + } + io->iocb->ki_complete(io->iocb, res); } @@ -835,8 +969,11 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ @@ -1489,6 +1626,17 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!fc->handle_killpriv_v2 || !setattr_should_drop_suidgid(idmap, file_inode(file))) writeback = true; + /* if we have dlm support acquire the lock for the area + * we are writing into */ + if (fc->dlm) { + /* note that a file opened with O_APPEND will have relative values + * in ki_pos. This code is here for convenience and for libfuse overlay test. + * Filesystems should handle O_APPEND with 'direct io' to additionally + * get the performance benefits of 'parallel direct writes'. */ + loff_t pos = file->f_flags & O_APPEND ? i_size_read(inode) + iocb->ki_pos : iocb->ki_pos; + size_t length = iov_iter_count(from); + fuse_get_dlm_write_lock(file, pos, length); + } } inode_lock(inode); @@ -1738,15 +1886,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; - if (res > 0 && write && fopen_direct_io) { - /* - * As in generic_file_direct_write(), invalidate after the - * write, to invalidate read-ahead cache that may have competed - * with the write. - */ - invalidate_inode_pages2_range(mapping, idx_from, idx_to); - } - return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); @@ -1785,6 +1924,8 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct address_space *mapping = inode->i_mapping; + loff_t pos = iocb->ki_pos; ssize_t res; bool exclusive; @@ -1801,6 +1942,16 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); } + if (res > 0 && mapping->nrpages) { + /* + * As in generic_file_direct_write(), invalidate after + * write, to invalidate read-ahead cache that may have + * with the write. + */ + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + res - 1) >> PAGE_SHIFT); + } } fuse_dio_unlock(iocb, exclusive); @@ -2327,6 +2478,60 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. + */ +static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = ff->fm; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK))) + return -EIO; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.start = offset; + inarg.end = offset + length - 1; + inarg.type = FUSE_DLM_PAGE_MKWRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->dlm = 0; + err = 0; + } + + if (!err && + fc->dlm && + (outarg.start > inarg.start || + outarg.end < inarg.end)) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu:%llu bytes returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); + fuse_abort_conn(fc); + err = -EINVAL; + } + return err; +} /* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly @@ -2345,7 +2550,18 @@ static void fuse_vma_close(struct vm_area_struct *vma) static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); - struct inode *inode = file_inode(vmf->vma->vm_file); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + + if (fm->fc->dlm) { + loff_t pos = vmf->pgoff << PAGE_SHIFT; + size_t length = PAGE_SIZE; + int err = fuse_get_page_mkwrite_lock(file, pos, length); + if (err < 0) { + return vmf_error(err); + } + } file_update_time(vmf->vma->vm_file); folio_lock(folio); @@ -2826,6 +3042,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter), shortened = 0; loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; + bool async = ff->fm->fc->async_dio; pos = offset; inode = file->f_mapping->host; @@ -2834,6 +3051,12 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) return 0; + if ((iov_iter_rw(iter) == WRITE) && async && !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + return ret; + } + io = kmalloc_obj(struct fuse_io_priv); if (!io) return -ENOMEM; @@ -2849,7 +3072,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fm->fc->async_dio; + io->async = async; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); @@ -3206,6 +3429,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); + fuse_dlm_cache_init(fi); fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); @@ -3213,4 +3437,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); + + if (enable_large_folios) + mapping_set_large_folios(inode->i_mapping); } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 134bf44aff0d39..4037fd7bdeee66 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -36,6 +36,8 @@ struct fuse_copy_state { bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ + struct page **pages; + int page_idx; } ring; }; diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c new file mode 100644 index 00000000000000..d765dd8018cc6a --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FUSE page lock cache implementation + */ +#include "fuse_i.h" +#include "fuse_dlm_cache.h" + +#include +#include +#include +#include + + +/* A range of pages with a lock */ +struct fuse_dlm_range { + /* Interval tree node */ + struct rb_node rb; + /* Start page offset (inclusive) */ + uint64_t start; + /* End page offset (inclusive) */ + uint64_t end; + /* Subtree end value for interval tree */ + uint64_t __subtree_end; + /* Lock mode */ + enum fuse_page_lock_mode mode; + /* Temporary list entry for operations */ + struct list_head list; +}; + +/* Lock modes for FUSE page cache */ +#define FUSE_PCACHE_LK_READ 1 /* Shared read lock */ +#define FUSE_PCACHE_LK_WRITE 2 /* Exclusive write lock */ + +/* Interval tree definitions for page ranges */ +static inline uint64_t fuse_dlm_range_start(struct fuse_dlm_range *range) +{ + return range->start; +} + +static inline uint64_t fuse_dlm_range_last(struct fuse_dlm_range *range) +{ + return range->end; +} + +INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, uint64_t, __subtree_end, + fuse_dlm_range_start, fuse_dlm_range_last, static, + fuse_page_it); + +/** + * fuse_page_cache_init - Initialize a page cache lock manager + * @cache: The cache to initialize + * + * Initialize a page cache lock manager for a FUSE inode. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_cache_init(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + + if (!cache) + return -EINVAL; + + init_rwsem(&cache->lock); + cache->ranges = RB_ROOT_CACHED; + + return 0; +} + +/** + * fuse_page_cache_destroy - Clean up a page cache lock manager + * @cache: The cache to clean up + * + * Release all locks and free all resources associated with the cache. + */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + struct rb_node *node; + + if (!cache) + return; + + /* Release all locks */ + down_write(&cache->lock); + while ((node = rb_first_cached(&cache->ranges)) != NULL) { + range = rb_entry(node, struct fuse_dlm_range, rb); + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + up_write(&cache->lock); +} + +/** + * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end] + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Return: Pointer to the first overlapping range, or NULL if none found + */ +static struct fuse_dlm_range * +fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + return fuse_page_it_iter_first(&cache->ranges, start, end); +} + +/** + * fuse_page_try_merge - Try to merge ranges within a specific region + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Attempt to merge ranges within and adjacent to the specified region + * that have the same lock mode. + */ +static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + struct fuse_dlm_range *range, *next; + struct rb_node *node; + + if (!cache) + return; + + /* Find the first range that might need merging */ + range = NULL; + node = rb_first_cached(&cache->ranges); + while (node) { + range = rb_entry(node, struct fuse_dlm_range, rb); + if (range->end >= start - 1) + break; + node = rb_next(node); + } + + if (!range || range->start > end + 1) + return; + + /* Try to merge ranges in and around the specified region */ + while (range && range->start <= end + 1) { + /* Get next range before we potentially modify the tree */ + next = NULL; + if (rb_next(&range->rb)) { + next = rb_entry(rb_next(&range->rb), + struct fuse_dlm_range, rb); + } + + /* Try to merge with next range if adjacent and same mode */ + if (next && range->mode == next->mode && + range->end + 1 == next->start) { + /* Merge ranges */ + range->end = next->end; + + /* Remove next from tree */ + fuse_page_it_remove(next, &cache->ranges); + kfree(next); + + /* Continue with the same range */ + continue; + } + + /* Move to next range */ + range = next; + } +} + +/** + * fuse_dlm_lock_range - Lock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode (read or write) + * + * Add a locked range on the specified range of pages. + * If parts of the range are already locked, only add the remaining parts. + * For overlapping ranges, handle lock compatibility: + * - READ locks are compatible with existing READ locks + * - READ locks are compatible with existing WRITE locks (downgrade not needed) + * - WRITE locks need to upgrade existing READ locks + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *new_range, *next; + int lock_mode; + int ret = 0; + LIST_HEAD(to_lock); + LIST_HEAD(to_upgrade); + uint64_t current_start = start; + + if (!cache || start > end) + return -EINVAL; + + /* Convert to lock mode */ + lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. */ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. + * + * Return: 0 on success, negative error code on failure + */ +static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, uint64_t start, + uint64_t end) +{ + struct fuse_dlm_range *range, *new_range; + int ret = 0; + + if (!cache || start > end) + return -EINVAL; + + /* Find a range that contains [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + if (!range) { + ret = -EINVAL; + goto out; + } + + /* If the hole is at the beginning of the range */ + if (start == range->start) { + range->start = end + 1; + goto out; + } + + /* If the hole is at the end of the range */ + if (end == range->end) { + range->end = start - 1; + goto out; + } + + /* The hole is in the middle, need to split */ + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out; + } + + /* Copy properties from original range */ + *new_range = *range; + INIT_LIST_HEAD(&new_range->list); + + /* Adjust ranges */ + new_range->start = end + 1; + range->end = start - 1; + + /* Update interval tree */ + fuse_page_it_remove(range, &cache->ranges); + fuse_page_it_insert(range, &cache->ranges); + fuse_page_it_insert(new_range, &cache->ranges); + +out: + return ret; +} + +/** + * fuse_dlm_unlock_range - Unlock a range of pages + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * + * Release locks on the specified range of pages. + * Note that if start and end are set to zero the cache is destroyed. + * + * Return: 0 on success, negative error code on failure + */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, + uint64_t start, uint64_t end) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range, *next; + int ret = 0; + + if (!cache) + return -EINVAL; + + if (start == 0 && end == 0) { + fuse_dlm_cache_release_locks(inode); + return 0; + } + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check if we need to punch a hole */ + if (start > range->start && end < range->end) { + /* Punch a hole in the middle */ + ret = fuse_dlm_punch_hole(cache, start, end); + if (ret) + goto out; + /* After punching a hole, we're done */ + break; + } else if (start > range->start) { + /* Adjust the end of the range */ + range->end = start - 1; + } else if (end < range->end) { + /* Adjust the start of the range */ + range->start = end + 1; + } else { + /* Complete overlap, remove the range */ + fuse_page_it_remove(range, &cache->ranges); + kfree(range); + } + + range = next; + } + +out: + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_range_is_locked - Check if a page range is already locked + * @cache: The page cache + * @start: Start page offset + * @end: End page offset + * @mode: Lock mode to check for (or NULL to check for any lock) + * + * Check if the specified range of pages is already locked. + * The entire range must be locked for this to return true. + * + * Return: true if the entire range is locked, false otherwise + */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode) +{ + struct fuse_dlm_cache *cache = &inode->dlm_locked_areas; + struct fuse_dlm_range *range; + int lock_mode = 0; + uint64_t current_start = start; + + if (!cache || start > end) + return false; + + /* Convert to lock mode if specified */ + if (mode == FUSE_PAGE_LOCK_READ) + lock_mode = FUSE_PCACHE_LK_READ; + else if (mode == FUSE_PAGE_LOCK_WRITE) + lock_mode = FUSE_PCACHE_LK_WRITE; + + down_read(&cache->lock); + + /* Find the first range that overlaps with [start, end] */ + range = fuse_dlm_find_overlapping(cache, start, end); + + /* Check if the entire range is covered */ + while (range && current_start <= end) { + /* If we're checking for a specific mode, verify it matches */ + if (lock_mode && range->mode != lock_mode) { + /* Wrong lock mode */ + up_read(&cache->lock); + return false; + } + + /* Check if there's a gap before this range */ + if (current_start < range->start) { + /* Found a gap */ + up_read(&cache->lock); + return false; + } + + /* Move current_start past this range */ + current_start = range->end + 1; + + /* Get next overlapping range */ + range = fuse_page_it_iter_next(range, start, end); + } + + /* Check if we covered the entire range */ + if (current_start <= end) { + /* There's a gap at the end */ + up_read(&cache->lock); + return false; + } + + up_read(&cache->lock); + return true; +} + +/** + * request a dlm lock from the fuse server + */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + uint64_t end = (offset + length - 1) | (PAGE_SIZE - 1); + + /* note that the offset and length don't have to be page aligned here + * but since we only get here on writeback caching we will send out + * page aligned requests */ + offset &= PAGE_MASK; + + FUSE_ARGS(args); + struct fuse_dlm_lock_in inarg; + struct fuse_dlm_lock_out outarg; + int err; + + /* note that this can be run from different processes + * at the same time. It is intentionally not protected + * since a DLM implementation in the FUSE server should take care + * of any races in lock requests */ + if (fuse_dlm_range_is_locked(fi, offset, + end, FUSE_PAGE_LOCK_WRITE)) + return; /* we already have this area locked */ + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + + inarg.start = offset; + inarg.end = end; + inarg.type = FUSE_DLM_LOCK_WRITE; + + args.opcode = FUSE_DLM_WB_LOCK; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + /* fuse server does not support dlm, save the info */ + fc->dlm = 0; + return; + } + + if (err) + return; + else + if (inarg.start < outarg.start || + inarg.end > outarg.end) { + /* fuse server is seriously broken */ + pr_warn("fuse: dlm lock request for %llu:%llu returned %llu:%llu bytes\n", + inarg.start, inarg.end, outarg.start, outarg.end); + fuse_abort_conn(fc); + return; + } else { + /* ignore any errors here, there is no way we can react appropriately */ + fuse_dlm_lock_range(fi, outarg.start, + outarg.end, + FUSE_PAGE_LOCK_WRITE); + } +} diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h new file mode 100644 index 00000000000000..438d31d28b666e --- /dev/null +++ b/fs/fuse/fuse_dlm_cache.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * FUSE page cache lock implementation + */ + +#ifndef _FS_FUSE_DLM_CACHE_H +#define _FS_FUSE_DLM_CACHE_H + +#include +#include +#include +#include + + +struct fuse_inode; + +/* Lock modes for page ranges */ +enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE }; + +/* Page cache lock manager */ +struct fuse_dlm_cache { + /* Lock protecting the tree */ + struct rw_semaphore lock; + /* Interval tree of locked ranges */ + struct rb_root_cached ranges; +}; + +/* Initialize a page cache lock manager */ +int fuse_dlm_cache_init(struct fuse_inode *inode); + +/* Clean up a page cache lock manager */ +void fuse_dlm_cache_release_locks(struct fuse_inode *inode); + +/* Lock a range of pages */ +int fuse_dlm_lock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); + +/* Unlock a range of pages */ +int fuse_dlm_unlock_range(struct fuse_inode *inode, uint64_t start, + uint64_t end); + +/* Check if a page range is already locked */ +bool fuse_dlm_range_is_locked(struct fuse_inode *inode, uint64_t start, + uint64_t end, enum fuse_page_lock_mode mode); + +/* this is the interface to the filesystem */ +void fuse_get_dlm_write_lock(struct file *file, loff_t offset, + size_t length); + +#endif /* _FS_FUSE_DLM_CACHE_H */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7f16049387d15e..d6b68d12be9307 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -31,6 +31,7 @@ #include #include #include +#include "fuse_dlm_cache.h" /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -83,6 +84,7 @@ extern struct mutex fuse_mutex; /** Module parameters */ extern unsigned int max_user_bgreq; extern unsigned int max_user_congthresh; +extern bool enable_large_folios; /* One forget request */ struct fuse_forget_link { @@ -113,6 +115,17 @@ struct fuse_backing { struct rcu_head rcu; }; +/** + * data structure to save the information that we have + * requested dlm locks for the given area from the fuse server +*/ +struct dlm_locked_area +{ + struct list_head list; + loff_t offset; + size_t size; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -168,6 +181,9 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; + + /* dlm locked areas we have sent lock requests for */ + struct fuse_dlm_cache dlm_locked_areas; }; /* readdir cache (directory only) */ @@ -377,6 +393,7 @@ union fuse_file_args { /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { struct kref refcnt; + struct work_struct work; int async; spinlock_t lock; unsigned reqs; @@ -772,6 +789,12 @@ struct fuse_conn { */ unsigned handle_killpriv_v2:1; + /* invalidate inode entries when doing inode invalidation */ + unsigned inval_inode_entries:1; + + /* expire inode entries when doing inode invalidation */ + unsigned expire_inode_entries:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -909,6 +932,9 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /* do we have support for dlm in the fs? */ + unsigned int dlm:1; + /** Passthrough support for read/write IO */ unsigned int passthrough:1; @@ -924,6 +950,9 @@ struct fuse_conn { /* Use io_uring for communication */ unsigned int io_uring; + + /* Does the filesystem support compound operations? */ + unsigned int compound_open_getattr:1; /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -945,6 +974,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Waitqueue for attr_version initialization */ + wait_queue_head_t attr_version_waitq; + /** Version counter for evict inode */ atomic64_t evict_ctr; @@ -1179,6 +1211,14 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); +/* + * Helper functions to initialize fuse_args for common operations + */ +void fuse_open_args_fill(struct fuse_args *args, u64 nodeid, int opcode, + struct fuse_open_in *inarg, struct fuse_open_out *outarg); +void fuse_getattr_args_fill(struct fuse_args *args, u64 nodeid, + struct fuse_getattr_in *inarg, + struct fuse_attr_out *outarg); struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); @@ -1270,6 +1310,8 @@ static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, return __fuse_simple_request(idmap, fm, args); } +ssize_t fuse_compound_request(struct fuse_mount *fm, struct fuse_args *args); + int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); @@ -1277,6 +1319,14 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, * Assign a unique id to a fuse request */ void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req); +struct fuse_compound_req; + +struct fuse_compound_req *fuse_compound_alloc(struct fuse_mount *fm, uint32_t flags); +int fuse_compound_add(struct fuse_compound_req *compound, + struct fuse_args *args); +ssize_t fuse_compound_send(struct fuse_compound_req *compound); +int fuse_compound_get_error(struct fuse_compound_req * compound, + int op_idx); /** * End a finished request @@ -1295,6 +1345,12 @@ void fuse_dentry_tree_cleanup(void); void fuse_epoch_work(struct work_struct *work); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ @@ -1543,7 +1599,9 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); /* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, - unsigned int open_flags, bool isdir); + struct inode *inode, + unsigned int open_flags, + bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index bbe9ddd8c71696..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,6 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* @@ -77,30 +78,55 @@ OPCODES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -TRACE_EVENT(fuse_request_send, +#define FUSE_REQ_TRACE_FIELDS \ + __field(dev_t, connection) \ + __field(uint64_t, unique) \ + __field(enum fuse_opcode, opcode) \ + __field(uint32_t, len) \ + +#define FUSE_REQ_TRACE_ASSIGN(req) \ + do { \ + __entry->connection = req->fm->fc->dev; \ + __entry->unique = req->in.h.unique; \ + __entry->opcode = req->in.h.opcode; \ + __entry->len = req->in.h.len; \ + } while (0) + + +TRACE_EVENT(fuse_request_enqueue, TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_bg_enqueue, + TP_PROTO(const struct fuse_req *req), TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), - TP_STRUCT__entry( - __field(dev_t, connection) - __field(uint64_t, unique) - __field(enum fuse_opcode, opcode) - __field(uint32_t, len) - ), + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); - TP_fast_assign( - __entry->connection = req->fm->fc->dev; - __entry->unique = req->in.h.unique; - __entry->opcode = req->in.h.opcode; - __entry->len = req->in.h.len; - ), +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + TP_ARGS(req), + TP_STRUCT__entry(FUSE_REQ_TRACE_FIELDS), + TP_fast_assign(FUSE_REQ_TRACE_ASSIGN(req)), TP_printk("connection %u req %llu opcode %u (%s) len %u ", __entry->connection, __entry->unique, __entry->opcode, __print_symbolic(__entry->opcode, OPCODES), __entry->len) ); + TRACE_EVENT(fuse_request_end, TP_PROTO(const struct fuse_req *req), diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c795abe47a4f4a..25fe7f81815b26 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dlm_cache.h" #include "fuse_dev_i.h" #include "dev_uring_i.h" @@ -32,6 +33,14 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); +static bool __read_mostly enable_compound; +module_param(enable_compound, bool, 0644); +MODULE_PARM_DESC(enable_uring, "Enable fuse compounds"); + +bool __read_mostly enable_large_folios = true; +module_param(enable_large_folios, bool, 0644); +MODULE_PARM_DESC(enable_large_folios, "Enable large folios support"); + static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); @@ -39,7 +48,7 @@ DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); -unsigned int fuse_max_pages_limit = 256; +unsigned int fuse_max_pages_limit = 4097; /* default is no timeout */ unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; @@ -195,6 +204,7 @@ static void fuse_evict_inode(struct inode *inode) WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); + fuse_dlm_cache_release_locks(fi); } } @@ -246,6 +256,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); fi->attr_version = atomic64_inc_return(&fc->attr_version); + wake_up_all(&fc->attr_version_waitq); fi->i_time = attr_valid; inode->i_ino = fuse_squash_ino(attr->ino); @@ -553,6 +564,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, return NULL; } +static void fuse_prune_aliases(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + + d_prune_aliases(inode); +} + +static void fuse_invalidate_inode_entry(struct inode *inode) +{ + struct dentry *dentry; + + if (S_ISDIR(inode->i_mode)) { + /* For directories, use d_invalidate to handle children and submounts */ + dentry = d_find_alias(inode); + if (dentry) { + d_invalidate(dentry); + fuse_invalidate_entry_cache(dentry); + dput(dentry); + } + } else { + /* For regular files, just unhash the dentry */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + fuse_invalidate_entry_cache(dentry); + } + spin_unlock(&inode->i_lock); + } +} + int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { @@ -566,18 +616,42 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return -ENOENT; fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + while (fi->attr_version == 0) { + spin_unlock(&fi->lock); + wait_event(fc->attr_version_waitq, READ_ONCE(fi->attr_version) != 0); + spin_lock(&fi->lock); + } + fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); + + if (fc->inval_inode_entries) + fuse_invalidate_inode_entry(inode); + else if (fc->expire_inode_entries) + fuse_prune_aliases(inode); fuse_invalidate_attr(inode); forget_all_cached_acls(inode); + security_inode_invalidate_secctx(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; + + if (fc->dlm && fc->writeback_cache) + /* Invalidate the range exactly as the fuse server requested + * except for the case where it sends -1. + * Note that this can lead to some inconsistencies if + * the fuse server sends unaligned data */ + fuse_dlm_unlock_range(fi, + offset, + pg_end == -1 ? 0 : + (offset + len - 1)); + invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } @@ -979,6 +1053,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, atomic_set(&fc->epoch, 1); INIT_WORK(&fc->epoch_work, fuse_epoch_work); init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->attr_version_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); @@ -991,6 +1066,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->blocked = 0; fc->initialized = 0; fc->connected = 1; + fc->dlm = 1; + + /* module option for now */ + fc->compound_open_getattr = enable_compound; + atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); @@ -1456,6 +1536,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if (flags & FUSE_REQUEST_TIMEOUT) timeout = arg->request_timeout; + if (flags & FUSE_INVAL_INODE_ENTRY) + fc->inval_inode_entries = 1; + if (flags & FUSE_EXPIRE_INODE_ENTRY) + fc->expire_inode_entries = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1464,7 +1548,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, init_server_timeout(fc, timeout); - fm->sb->s_bdi->ra_pages = + if (CAP_SYS_ADMIN) + fm->sb->s_bdi->ra_pages = ra_pages; + else + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; @@ -1492,8 +1579,7 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; - flags = - FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -1505,7 +1591,9 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | + FUSE_EXPIRE_INODE_ENTRY | FUSE_URING_REDUCED_Q | + FUSE_EXPIRE_INODE_ENTRY | FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) @@ -2087,6 +2175,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fdc175e93f7474..07a02e47b2c3a6 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -494,7 +494,7 @@ static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) if (!S_ISREG(inode->i_mode) && !isdir) return ERR_PTR(-ENOTTY); - return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); + return fuse_file_open(fm, get_node_id(inode), NULL, O_RDONLY, isdir); } static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c13e1f9a2f12bd..30bb854fbc9408 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -448,6 +448,10 @@ struct fuse_file_lock { * FUSE_OVER_IO_URING: Indicate that client supports io-uring * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. * init_out.request_timeout contains the timeout (in secs) + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation + * FUSE_URING_REDUCED_Q: Client (kernel) supports less queues - Server is free + * to register between 1 and nr-core io-uring queues */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -495,6 +499,10 @@ struct fuse_file_lock { #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) #define FUSE_REQUEST_TIMEOUT (1ULL << 42) +#define FUSE_ALIGN_PG_ORDER (1ULL << 50) +#define FUSE_URING_REDUCED_Q (1ULL << 59) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) /** * CUSE INIT request/reply flags @@ -662,7 +670,17 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, - FUSE_COPY_FILE_RANGE_64 = 53, + FUSE_COPY_FILE_RANGE_64 = 53, + + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, + + /* A compound request works like multiple simple requests. + * This is a special case for calls that can be combined atomic on the + * fuse server. If the server actually does atomically execute the command is + * left to the fuse server implementation. + */ + FUSE_COMPOUND = 101, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1245,6 +1263,74 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Type of the dlm lock requested + */ +enum fuse_dlm_lock_type { + FUSE_DLM_LOCK_NONE = 0, + FUSE_DLM_LOCK_READ = 1, + FUSE_DLM_LOCK_WRITE = 2, + FUSE_DLM_PAGE_MKWRITE = 3, +}; + +/** + * struct fuse_dlm_lock_in - Lock request + * @fh: file handle + * @offset: offset into the file + * @size: size of the locked region + * @type: type of lock + */ +struct fuse_dlm_lock_in { + uint64_t fh; + uint64_t start; + uint64_t end; + uint32_t type; + uint32_t reserved; +}; + + +/** + * struct fuse_dlm_lock_out - Lock response + * @locksize: how many bytes where locked by the call + * (most of the time we want to lock more than is requested + * to reduce number of calls) + */ +struct fuse_dlm_lock_out { + uint64_t start; + uint64_t end; + uint64_t reserved; +}; + +/* + * Compound request header + * + * This header is followed by the fuse requests + */ +struct fuse_compound_in { + uint32_t count; /* Number of operations */ + uint32_t flags; /* Compound flags */ + + /* Total size of all results. + * This is needed for preallocating the whole result for all + * commands in this compound. + */ + uint32_t result_size; + uint64_t reserved; +}; + +/* + * Compound response header + * + * This header is followed by complete fuse responses + */ +struct fuse_compound_out { + uint32_t count; /* Number of results */ + uint32_t flags; /* Result flags */ + uint64_t reserved; +}; + +#define FUSE_MAX_COMPOUND_OPS 16 /* Maximum operations per compound */ + /** * Size of the ring buffer header */ diff --git a/ubuntu/igh-ecat/master/Makefile b/ubuntu/igh-ecat/master/Makefile index 8aef742ebb2f74..4f8fc539c6ab6c 100644 --- a/ubuntu/igh-ecat/master/Makefile +++ b/ubuntu/igh-ecat/master/Makefile @@ -1,5 +1,5 @@ ccflags-y := -I$(src)/../ \ - -Wmaybe-uninitialized + -Wuninitialized obj-$(CONFIG_IGH_ECAT) += ec_master.o