Skip to content

Commit 07754bf

Browse files
committed
io_uring: enable toggle of iowait usage when waiting on CQEs
By default, io_uring marks a waiting task as being in iowait, if it's sleeping waiting on events and there are pending requests. This isn't necessarily always useful, and may be confusing on non-storage setups where iowait isn't expected. It can also cause extra power usage, by preventing the CPU from entering lower sleep states.

This adds a new enter flag, IORING_ENTER_NO_IOWAIT. If set, then io_uring will not account the sleeping task as being in iowait. If the kernel supports this feature, then it will be marked by having the IORING_FEAT_NO_IOWAIT feature flag set.

As the kernel currently does not support separating the iowait accounting and CPU frequency boosting, the IORING_ENTER_NO_IOWAIT flag controls both of these at the same time. In the future, if those do end up being split, then it'd be possible to control them separately. However, it seems more likely that the kernel will decouple iowait and CPU frequency boosting anyway.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 96af5af commit 07754bf

2 files changed

Lines changed: 21 additions & 13 deletions

File tree

include/uapi/linux/io_uring.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@ struct io_cqring_offsets {
541541
#define IORING_ENTER_REGISTERED_RING (1U << 4)
542542
#define IORING_ENTER_ABS_TIMER (1U << 5)
543543
#define IORING_ENTER_EXT_ARG_REG (1U << 6)
544+
#define IORING_ENTER_NO_IOWAIT (1U << 7)
544545

545546
/*
546547
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -578,6 +579,7 @@ struct io_uring_params {
578579
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
579580
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
580581
#define IORING_FEAT_RW_ATTR (1U << 16)
582+
#define IORING_FEAT_NO_IOWAIT (1U << 17)
581583

582584
/*
583585
* io_uring_register(2) opcodes and arguments

io_uring/io_uring.c

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2485,8 +2485,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
24852485
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
24862486
}
24872487

2488+
struct ext_arg {
2489+
size_t argsz;
2490+
struct timespec64 ts;
2491+
const sigset_t __user *sig;
2492+
ktime_t min_time;
2493+
bool ts_set;
2494+
bool iowait;
2495+
};
2496+
24882497
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
24892498
struct io_wait_queue *iowq,
2499+
struct ext_arg *ext_arg,
24902500
ktime_t start_time)
24912501
{
24922502
int ret = 0;
@@ -2496,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
24962506
* can take into account that the task is waiting for IO - turns out
24972507
* to be important for low QD IO.
24982508
*/
2499-
if (current_pending_io())
2509+
if (ext_arg->iowait && current_pending_io())
25002510
current->in_iowait = 1;
25012511
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
25022512
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
@@ -2509,6 +2519,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
25092519
/* If this returns > 0, the caller should retry */
25102520
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
25112521
struct io_wait_queue *iowq,
2522+
struct ext_arg *ext_arg,
25122523
ktime_t start_time)
25132524
{
25142525
if (unlikely(READ_ONCE(ctx->check_cq)))
@@ -2522,17 +2533,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
25222533
if (unlikely(io_should_wake(iowq)))
25232534
return 0;
25242535

2525-
return __io_cqring_wait_schedule(ctx, iowq, start_time);
2536+
return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
25262537
}
25272538

2528-
struct ext_arg {
2529-
size_t argsz;
2530-
struct timespec64 ts;
2531-
const sigset_t __user *sig;
2532-
ktime_t min_time;
2533-
bool ts_set;
2534-
};
2535-
25362539
/*
25372540
* Wait until events become available, if we don't already have some. The
25382541
* application must reap them itself, as they reside on the shared cq ring.
@@ -2610,7 +2613,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
26102613
TASK_INTERRUPTIBLE);
26112614
}
26122615

2613-
ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
2616+
ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
26142617
__set_current_state(TASK_RUNNING);
26152618
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
26162619

@@ -3261,6 +3264,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
32613264
const struct io_uring_getevents_arg __user *uarg = argp;
32623265
struct io_uring_getevents_arg arg;
32633266

3267+
ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT);
3268+
32643269
/*
32653270
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
32663271
* is just a pointer to the sigset_t.
@@ -3338,7 +3343,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
33383343
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
33393344
IORING_ENTER_REGISTERED_RING |
33403345
IORING_ENTER_ABS_TIMER |
3341-
IORING_ENTER_EXT_ARG_REG)))
3346+
IORING_ENTER_EXT_ARG_REG |
3347+
IORING_ENTER_NO_IOWAIT)))
33423348
return -EINVAL;
33433349

33443350
/*
@@ -3752,7 +3758,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
37523758
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
37533759
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
37543760
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
3755-
IORING_FEAT_RW_ATTR;
3761+
IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
37563762

37573763
if (copy_to_user(params, p, sizeof(*p))) {
37583764
ret = -EFAULT;

0 commit comments

Comments
 (0)