Skip to content

Commit 4c6283e

Browse files
committed
Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices
xfs: add support for zoned devices Add support for the new zoned space allocator and thus for zoned devices: https://zonedstorage.io/docs/introduction/zoned-storage to XFS. This has been developed for and tested on both SMR hard drives, which are the oldest and most common class of zoned devices: https://zonedstorage.io/docs/introduction/smr and ZNS SSDs: https://zonedstorage.io/docs/introduction/zns It has not been tested with zoned UFS devices, as their current capacity points and performance characteristics aren't too interesting for XFS use cases (but never say never). Sequential write only zones are only supported for data using a new allocator for the RT device, which maps each zone to an rtgroup that is written sequentially. All metadata and (for now) the log require using randomly writable space. This means a realtime device is required to support zoned storage, but for the common case of SMR hard drives that contain randomly writable zones and sequential write required zones on the same block device, the concept of an internal RT device is added which means using XFS on an SMR HDD is as simple as: $ mkfs.xfs /dev/sda $ mount /dev/sda /mnt When using NVMe ZNS SSDs that do not support conventional zones, the traditional multi-device RT configuration is required. E.g. for an SSD with a conventional namespace 1 and a zoned namespace 2: $ mkfs.xfs /dev/nvme0n1 -o rtdev=/dev/nvme0n2 $ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt The zoned allocator can also be used on conventional block devices, or on conventional zones (e.g. when using an SMR HDD as the external RT device). For example, using zoned XFS on normal SSDs shows very nice performance advantages and write amplification reduction for intelligent workloads like RocksDB.
Some work is still in progress or planned, but should not affect the integration with the rest of XFS or the on-disk format: - support for quotas - support for reflinks Note that the I/O path already supports reflink, but garbage collection isn't refcount-aware yet and would unshare shared blocks, thus rendering the feature useless.
2 parents 0a1fd78 + 9c47791 commit 4c6283e

75 files changed

Lines changed: 5649 additions & 937 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

fs/xfs/Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
6464
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
6565
xfs_rtbitmap.o \
6666
xfs_rtgroup.o \
67+
xfs_zones.o \
6768
)
6869

6970
# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
136137
xfs_quotaops.o
137138

138139
# xfs_rtbitmap is shared with libxfs
139-
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
140+
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
141+
xfs_zone_alloc.o \
142+
xfs_zone_gc.o \
143+
xfs_zone_info.o \
144+
xfs_zone_space_resv.o
140145

141146
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
142147
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o

fs/xfs/libxfs/xfs_bmap.c

Lines changed: 23 additions & 293 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@
3434
#include "xfs_ag.h"
3535
#include "xfs_ag_resv.h"
3636
#include "xfs_refcount.h"
37-
#include "xfs_icache.h"
3837
#include "xfs_iomap.h"
3938
#include "xfs_health.h"
4039
#include "xfs_bmap_item.h"
4140
#include "xfs_symlink_remote.h"
4241
#include "xfs_inode_util.h"
4342
#include "xfs_rtgroup.h"
43+
#include "xfs_zone_alloc.h"
4444

4545
struct kmem_cache *xfs_bmap_intent_cache;
4646

@@ -171,18 +171,16 @@ xfs_bmbt_update(
171171
* Compute the worst-case number of indirect blocks that will be used
172172
* for ip's delayed extent of length "len".
173173
*/
174-
STATIC xfs_filblks_t
174+
xfs_filblks_t
175175
xfs_bmap_worst_indlen(
176-
xfs_inode_t *ip, /* incore inode pointer */
177-
xfs_filblks_t len) /* delayed extent length */
176+
struct xfs_inode *ip, /* incore inode pointer */
177+
xfs_filblks_t len) /* delayed extent length */
178178
{
179-
int level; /* btree level number */
180-
int maxrecs; /* maximum record count at this level */
181-
xfs_mount_t *mp; /* mount structure */
182-
xfs_filblks_t rval; /* return value */
179+
struct xfs_mount *mp = ip->i_mount;
180+
int maxrecs = mp->m_bmap_dmxr[0];
181+
int level;
182+
xfs_filblks_t rval;
183183

184-
mp = ip->i_mount;
185-
maxrecs = mp->m_bmap_dmxr[0];
186184
for (level = 0, rval = 0;
187185
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
188186
level++) {
@@ -2571,146 +2569,6 @@ xfs_bmap_add_extent_unwritten_real(
25712569
#undef PREV
25722570
}
25732571

2574-
/*
2575-
* Convert a hole to a delayed allocation.
2576-
*/
2577-
STATIC void
2578-
xfs_bmap_add_extent_hole_delay(
2579-
xfs_inode_t *ip, /* incore inode pointer */
2580-
int whichfork,
2581-
struct xfs_iext_cursor *icur,
2582-
xfs_bmbt_irec_t *new) /* new data to add to file extents */
2583-
{
2584-
struct xfs_ifork *ifp; /* inode fork pointer */
2585-
xfs_bmbt_irec_t left; /* left neighbor extent entry */
2586-
xfs_filblks_t newlen=0; /* new indirect size */
2587-
xfs_filblks_t oldlen=0; /* old indirect size */
2588-
xfs_bmbt_irec_t right; /* right neighbor extent entry */
2589-
uint32_t state = xfs_bmap_fork_to_state(whichfork);
2590-
xfs_filblks_t temp; /* temp for indirect calculations */
2591-
2592-
ifp = xfs_ifork_ptr(ip, whichfork);
2593-
ASSERT(isnullstartblock(new->br_startblock));
2594-
2595-
/*
2596-
* Check and set flags if this segment has a left neighbor
2597-
*/
2598-
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
2599-
state |= BMAP_LEFT_VALID;
2600-
if (isnullstartblock(left.br_startblock))
2601-
state |= BMAP_LEFT_DELAY;
2602-
}
2603-
2604-
/*
2605-
* Check and set flags if the current (right) segment exists.
2606-
* If it doesn't exist, we're converting the hole at end-of-file.
2607-
*/
2608-
if (xfs_iext_get_extent(ifp, icur, &right)) {
2609-
state |= BMAP_RIGHT_VALID;
2610-
if (isnullstartblock(right.br_startblock))
2611-
state |= BMAP_RIGHT_DELAY;
2612-
}
2613-
2614-
/*
2615-
* Set contiguity flags on the left and right neighbors.
2616-
* Don't let extents get too large, even if the pieces are contiguous.
2617-
*/
2618-
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
2619-
left.br_startoff + left.br_blockcount == new->br_startoff &&
2620-
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
2621-
state |= BMAP_LEFT_CONTIG;
2622-
2623-
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
2624-
new->br_startoff + new->br_blockcount == right.br_startoff &&
2625-
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
2626-
(!(state & BMAP_LEFT_CONTIG) ||
2627-
(left.br_blockcount + new->br_blockcount +
2628-
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
2629-
state |= BMAP_RIGHT_CONTIG;
2630-
2631-
/*
2632-
* Switch out based on the contiguity flags.
2633-
*/
2634-
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
2635-
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2636-
/*
2637-
* New allocation is contiguous with delayed allocations
2638-
* on the left and on the right.
2639-
* Merge all three into a single extent record.
2640-
*/
2641-
temp = left.br_blockcount + new->br_blockcount +
2642-
right.br_blockcount;
2643-
2644-
oldlen = startblockval(left.br_startblock) +
2645-
startblockval(new->br_startblock) +
2646-
startblockval(right.br_startblock);
2647-
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2648-
oldlen);
2649-
left.br_startblock = nullstartblock(newlen);
2650-
left.br_blockcount = temp;
2651-
2652-
xfs_iext_remove(ip, icur, state);
2653-
xfs_iext_prev(ifp, icur);
2654-
xfs_iext_update_extent(ip, state, icur, &left);
2655-
break;
2656-
2657-
case BMAP_LEFT_CONTIG:
2658-
/*
2659-
* New allocation is contiguous with a delayed allocation
2660-
* on the left.
2661-
* Merge the new allocation with the left neighbor.
2662-
*/
2663-
temp = left.br_blockcount + new->br_blockcount;
2664-
2665-
oldlen = startblockval(left.br_startblock) +
2666-
startblockval(new->br_startblock);
2667-
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2668-
oldlen);
2669-
left.br_blockcount = temp;
2670-
left.br_startblock = nullstartblock(newlen);
2671-
2672-
xfs_iext_prev(ifp, icur);
2673-
xfs_iext_update_extent(ip, state, icur, &left);
2674-
break;
2675-
2676-
case BMAP_RIGHT_CONTIG:
2677-
/*
2678-
* New allocation is contiguous with a delayed allocation
2679-
* on the right.
2680-
* Merge the new allocation with the right neighbor.
2681-
*/
2682-
temp = new->br_blockcount + right.br_blockcount;
2683-
oldlen = startblockval(new->br_startblock) +
2684-
startblockval(right.br_startblock);
2685-
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2686-
oldlen);
2687-
right.br_startoff = new->br_startoff;
2688-
right.br_startblock = nullstartblock(newlen);
2689-
right.br_blockcount = temp;
2690-
xfs_iext_update_extent(ip, state, icur, &right);
2691-
break;
2692-
2693-
case 0:
2694-
/*
2695-
* New allocation is not contiguous with another
2696-
* delayed allocation.
2697-
* Insert a new entry.
2698-
*/
2699-
oldlen = newlen = 0;
2700-
xfs_iext_insert(ip, icur, new, state);
2701-
break;
2702-
}
2703-
if (oldlen != newlen) {
2704-
ASSERT(oldlen > newlen);
2705-
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
2706-
2707-
/*
2708-
* Nothing to do for disk quota accounting here.
2709-
*/
2710-
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
2711-
}
2712-
}
2713-
27142572
/*
27152573
* Convert a hole to a real allocation.
27162574
*/
@@ -4039,144 +3897,6 @@ xfs_bmapi_read(
40393897
return 0;
40403898
}
40413899

4042-
/*
4043-
* Add a delayed allocation extent to an inode. Blocks are reserved from the
4044-
* global pool and the extent inserted into the inode in-core extent tree.
4045-
*
4046-
* On entry, got refers to the first extent beyond the offset of the extent to
4047-
* allocate or eof is specified if no such extent exists. On return, got refers
4048-
* to the extent record that was inserted to the inode fork.
4049-
*
4050-
* Note that the allocated extent may have been merged with contiguous extents
4051-
* during insertion into the inode fork. Thus, got does not reflect the current
4052-
* state of the inode fork on return. If necessary, the caller can use lastx to
4053-
* look up the updated record in the inode fork.
4054-
*/
4055-
int
4056-
xfs_bmapi_reserve_delalloc(
4057-
struct xfs_inode *ip,
4058-
int whichfork,
4059-
xfs_fileoff_t off,
4060-
xfs_filblks_t len,
4061-
xfs_filblks_t prealloc,
4062-
struct xfs_bmbt_irec *got,
4063-
struct xfs_iext_cursor *icur,
4064-
int eof)
4065-
{
4066-
struct xfs_mount *mp = ip->i_mount;
4067-
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
4068-
xfs_extlen_t alen;
4069-
xfs_extlen_t indlen;
4070-
uint64_t fdblocks;
4071-
int error;
4072-
xfs_fileoff_t aoff;
4073-
bool use_cowextszhint =
4074-
whichfork == XFS_COW_FORK && !prealloc;
4075-
4076-
retry:
4077-
/*
4078-
* Cap the alloc length. Keep track of prealloc so we know whether to
4079-
* tag the inode before we return.
4080-
*/
4081-
aoff = off;
4082-
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
4083-
if (!eof)
4084-
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
4085-
if (prealloc && alen >= len)
4086-
prealloc = alen - len;
4087-
4088-
/*
4089-
* If we're targetting the COW fork but aren't creating a speculative
4090-
* posteof preallocation, try to expand the reservation to align with
4091-
* the COW extent size hint if there's sufficient free space.
4092-
*
4093-
* Unlike the data fork, the CoW cancellation functions will free all
4094-
* the reservations at inactivation, so we don't require that every
4095-
* delalloc reservation have a dirty pagecache.
4096-
*/
4097-
if (use_cowextszhint) {
4098-
struct xfs_bmbt_irec prev;
4099-
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
4100-
4101-
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
4102-
prev.br_startoff = NULLFILEOFF;
4103-
4104-
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
4105-
1, 0, &aoff, &alen);
4106-
ASSERT(!error);
4107-
}
4108-
4109-
/*
4110-
* Make a transaction-less quota reservation for delayed allocation
4111-
* blocks. This number gets adjusted later. We return if we haven't
4112-
* allocated blocks already inside this loop.
4113-
*/
4114-
error = xfs_quota_reserve_blkres(ip, alen);
4115-
if (error)
4116-
goto out;
4117-
4118-
/*
4119-
* Split changing sb for alen and indlen since they could be coming
4120-
* from different places.
4121-
*/
4122-
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
4123-
ASSERT(indlen > 0);
4124-
4125-
fdblocks = indlen;
4126-
if (XFS_IS_REALTIME_INODE(ip)) {
4127-
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
4128-
if (error)
4129-
goto out_unreserve_quota;
4130-
} else {
4131-
fdblocks += alen;
4132-
}
4133-
4134-
error = xfs_dec_fdblocks(mp, fdblocks, false);
4135-
if (error)
4136-
goto out_unreserve_frextents;
4137-
4138-
ip->i_delayed_blks += alen;
4139-
xfs_mod_delalloc(ip, alen, indlen);
4140-
4141-
got->br_startoff = aoff;
4142-
got->br_startblock = nullstartblock(indlen);
4143-
got->br_blockcount = alen;
4144-
got->br_state = XFS_EXT_NORM;
4145-
4146-
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
4147-
4148-
/*
4149-
* Tag the inode if blocks were preallocated. Note that COW fork
4150-
* preallocation can occur at the start or end of the extent, even when
4151-
* prealloc == 0, so we must also check the aligned offset and length.
4152-
*/
4153-
if (whichfork == XFS_DATA_FORK && prealloc)
4154-
xfs_inode_set_eofblocks_tag(ip);
4155-
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
4156-
xfs_inode_set_cowblocks_tag(ip);
4157-
4158-
return 0;
4159-
4160-
out_unreserve_frextents:
4161-
if (XFS_IS_REALTIME_INODE(ip))
4162-
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
4163-
out_unreserve_quota:
4164-
if (XFS_IS_QUOTA_ON(mp))
4165-
xfs_quota_unreserve_blkres(ip, alen);
4166-
out:
4167-
if (error == -ENOSPC || error == -EDQUOT) {
4168-
trace_xfs_delalloc_enospc(ip, off, len);
4169-
4170-
if (prealloc || use_cowextszhint) {
4171-
/* retry without any preallocation */
4172-
use_cowextszhint = false;
4173-
prealloc = 0;
4174-
goto retry;
4175-
}
4176-
}
4177-
return error;
4178-
}
4179-
41803900
static int
41813901
xfs_bmapi_allocate(
41823902
struct xfs_bmalloca *bma)
@@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
49484668
int whichfork,
49494669
struct xfs_iext_cursor *icur,
49504670
struct xfs_bmbt_irec *got,
4951-
struct xfs_bmbt_irec *del)
4671+
struct xfs_bmbt_irec *del,
4672+
uint32_t bflags) /* bmapi flags */
49524673
{
49534674
struct xfs_mount *mp = ip->i_mount;
49544675
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
50684789
da_diff = da_old - da_new;
50694790
fdblocks = da_diff;
50704791

5071-
if (isrt)
5072-
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
5073-
else
4792+
if (bflags & XFS_BMAPI_REMAP) {
4793+
;
4794+
} else if (isrt) {
4795+
xfs_rtbxlen_t rtxlen;
4796+
4797+
rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
4798+
if (xfs_is_zoned_inode(ip))
4799+
xfs_zoned_add_available(mp, rtxlen);
4800+
xfs_add_frextents(mp, rtxlen);
4801+
} else {
50744802
fdblocks += del->br_blockcount;
4803+
}
50754804

50764805
xfs_add_fdblocks(mp, fdblocks);
50774806
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5670,7 +5399,8 @@ __xfs_bunmapi(
56705399

56715400
delete:
56725401
if (wasdel) {
5673-
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
5402+
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
5403+
&del, flags);
56745404
} else {
56755405
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
56765406
&del, &tmp_logflags, whichfork,

0 commit comments

Comments
 (0)