Skip to content

Commit dad10c2

Browse files
krekre
authored and committed
PR bin/59996 - handle hidden swap list reordering
A different attempt to achieve what 2 revs back was attempting. The swap lists must be locked (uvm_swap_data_lock) when we are traversing the lists of swap devices, as otherwise the lists can reorder themselves behind our back. But we cannot hold that lock when actually doing the processing, as our process might need to page/swap to copy out data, and doing that will also attempt to take the lock - panic (or doom). Instead, traverse the lists with the lock held, so they are stable, but do nothing but keep a record of all of the swapdevs (independent of their lists) and then use this new list of swapdevs to actually do the work. The number or identity of the swap devices cannot change during all of this, as we also hold swap_syscall_lock which prevents any other swapctl() operations (like adding or deleting devices) from occurring. Once we have done that, the number of swap devices found is the number returned from swapctl(SWAP_STATS) (provided it is no bigger than requested). Note that this does not guarantee that the number of devices returned from swapctl(SWAP_STATS) will agree with an earlier call to swapctl(SWAP_NSWAP) - that is obviously impossible, absolutely anything might have occurred between the two calls.
1 parent 9b24c1e commit dad10c2

1 file changed

Lines changed: 144 additions & 23 deletions

File tree

sys/uvm/uvm_swap.c

Lines changed: 144 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $NetBSD: uvm_swap.c,v 1.213 2026/02/12 13:34:45 kre Exp $ */
1+
/* $NetBSD: uvm_swap.c,v 1.214 2026/02/13 03:44:49 kre Exp $ */
22

33
/*
44
* Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
@@ -30,7 +30,7 @@
3030
*/
3131

3232
#include <sys/cdefs.h>
33-
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.213 2026/02/12 13:34:45 kre Exp $");
33+
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.214 2026/02/13 03:44:49 kre Exp $");
3434

3535
#include "opt_uvmhist.h"
3636
#include "opt_compat_netbsd.h"
@@ -779,9 +779,11 @@ uvm_swap_stats(char *ptr, int misc,
779779
register_t *retval)
780780
{
781781
struct swappri *spp;
782-
struct swapdev *sdp;
782+
struct swapdev *sdp, **sdps, **sp;
783783
struct swapent sep;
784-
int count = 0;
784+
size_t sdpsize = 0;
785+
struct swapdev *stackbuf[8];
786+
int count, slots;
785787
int error;
786788

787789
KASSERT(len <= sizeof(sep));
@@ -794,34 +796,153 @@ uvm_swap_stats(char *ptr, int misc,
794796
if (misc == 0 || uvmexp.nswapdev == 0)
795797
return 0;
796798

797-
/* Make sure userland cannot exhaust kernel memory */
798-
if ((size_t)misc > (size_t)uvmexp.nswapdev)
799-
misc = uvmexp.nswapdev;
799+
KASSERT(rw_lock_held(&swap_syscall_lock));
800+
801+
/*
802+
* Allocate space for pointers to all swapdevs
803+
*
804+
* This needs to be done here (not earlier) (and so needs
805+
* the unlock/lock dance) because of the way the various
806+
* compat functions work.
807+
*/
808+
sdps = NULL;
809+
slots = uvmexp.nswapdev;
810+
811+
if (slots > misc) /* we never need more than requested */
812+
slots = misc;
813+
814+
/*
815+
* Nb: do not limit misc to <= uvmexp.nswapdev yet,
816+
* as the latter might get bigger (or smaller)
817+
*/
818+
819+
if ((SIZE_T_MAX / sizeof sdp) <= misc) /* unlikely */
820+
return E2BIG;
821+
822+
/*
823+
* One slot for each currently existing swap device,
824+
* but limited no more than the request wants (misc).
825+
*/
826+
sdpsize = (size_t)slots * sizeof sdp;
827+
828+
/*
829+
* Borrow from kmem_tmpbuf_alloc() but don't use it
830+
* so we don't need to do the unlock dance unnecessarily
831+
*/
832+
if (sdpsize <= sizeof stackbuf) {
833+
sdps = stackbuf;
834+
} else {
835+
rw_exit(&swap_syscall_lock);
836+
837+
sdps = kmem_alloc(sdpsize, KM_SLEEP);
838+
839+
rw_enter(&swap_syscall_lock, RW_READER);
840+
841+
/*
842+
* At this point, 3 possibilities.
843+
*
844+
* uvmexp.nswapdev has increased, as a
845+
* new swap device got added. That's OK,
846+
* just ignore the excess device, and
847+
* return the first N (the number that
848+
* were there when we started.
849+
*
850+
* uvmexp.nswapdev has decreased, a swap
851+
* device was deleted. In this case we
852+
* will return less devices than requested
853+
* but that's OK. We will have more memory
854+
* than is needed to save them all, but
855+
* just a little more, and it gets freed
856+
* just below.
857+
*
858+
* uvmexp.nswapdev hasn't changed. This
859+
* will be the usual case (no swapctl operations
860+
* occurred while the lock was releases - or
861+
* possibly a device was deleted and another
862+
* added - that's irrelevant, at this point
863+
* all that matters is the number of devices,
864+
* we haven't looked at the lists yet.
865+
*
866+
* So we never need to repeat this allocation,
867+
* Once is quite enough.
868+
*
869+
* And we never need to look at uvmexp.nswapdev
870+
* again!
871+
*/
872+
}
800873

801874
KASSERT(rw_lock_held(&swap_syscall_lock));
802875

876+
/*
877+
* Collect all of the swap descriptors, while
878+
* holding the data lock, so the lists cannot
879+
* change. Then they can be used safely.
880+
*
881+
* They cannot be deleted, because swap_syscall_lock
882+
* is held, but the lists holding them can be adjusted
883+
* except in this small loop where we lock out that
884+
* kind of activity. No processing happens here,
885+
* this is fast, with no func calls, or anything which
886+
* might perform operations which might need the lock.
887+
*/
888+
mutex_enter(&uvm_swap_data_lock);
889+
sp = sdps;
890+
count = 0;
803891
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
804892
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
805-
int inuse;
893+
if (++count <= slots)
894+
*sp++ = sdp;
895+
/*
896+
* don't bother with exiting the loops
897+
* early, they tend to be very short,
898+
* and not exhausting them is a very rare
899+
* occurrence. So just loop and do nothing
900+
* in the odd case we could break out early.
901+
*/
902+
}
903+
}
904+
mutex_exit(&uvm_swap_data_lock);
806905

807-
if (misc-- <= 0)
808-
break;
906+
/*
907+
* Now we have a stable list of devices which cannot change,
908+
* even if the swapping lists are reordered.
909+
*/
809910

810-
inuse = btodb((uint64_t)sdp->swd_npginuse <<
811-
PAGE_SHIFT);
812-
813-
memset(&sep, 0, sizeof(sep));
814-
swapent_cvt(&sep, sdp, inuse);
815-
if (f)
816-
(*f)(&sep, &sep);
817-
if ((error = copyout(&sep, ptr, len)) != 0)
818-
return error;
819-
ptr += len;
820-
count++;
821-
}
911+
if (misc > slots) /* the number of storage slots */
912+
misc = slots;
913+
if (misc > count) /* the number of devices now */
914+
misc = count;
915+
916+
error = 0;
917+
count = 0;
918+
sp = sdps;
919+
while (misc-- > 0) {
920+
int inuse;
921+
922+
sdp = *sp++;
923+
924+
inuse = btodb((uint64_t)sdp->swd_npginuse <<
925+
PAGE_SHIFT);
926+
927+
memset(&sep, 0, sizeof(sep));
928+
swapent_cvt(&sep, sdp, inuse);
929+
if (f)
930+
(*f)(&sep, &sep);
931+
if ((error = copyout(&sep, ptr, len)) != 0)
932+
goto out;
933+
ptr += len;
934+
count++;
822935
}
823936
*retval = count;
824-
return 0;
937+
out:;
938+
if (sdps != stackbuf) {
939+
/*
940+
* XXX should unlock & lock again here
941+
* as well probably, but for now, no...
942+
*/
943+
kmem_free(sdps, sdpsize);
944+
}
945+
return error;
825946
}
826947

827948
/*

0 commit comments

Comments
 (0)