Skip to content

Commit 64d0361

Browse files
yhr authored and Christoph Hellwig committed
xfs: support write life time based data placement
Add a file write lifetime data placement allocation scheme that aims to minimize fragmentation and thereby to do two things: a) separate file data into different zones when possible; b) colocate file data of similar lifetimes when feasible. To get the best results, average file sizes should align with the zone capacity that is reported through the XFS_IOC_FSGEOMETRY ioctl. This improvement in data placement efficiency reduces the number of blocks requiring relocation by GC, and thus decreases overall write amplification. The impact on performance varies depending on how full the file system is. For RocksDB using leveled compaction, the lifetime hints can improve throughput for overwrite workloads at 80% file system utilization by ~10%, but at lower file system utilization there won't be as much benefit to application performance, as there is less need for garbage collection to start with. Lifetime hints can be disabled using the nolifetime mount option.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
1 parent 7452a6d commit 64d0361

5 files changed

Lines changed: 141 additions & 19 deletions

File tree

fs/xfs/xfs_mount.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ typedef struct xfs_mount {
373373
#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
374374

375375
/* Mount features */
376+
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
376377
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
377378
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
378379
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -428,6 +429,7 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
428429
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
429430
__XFS_HAS_FEAT(metadir, METADIR)
430431
__XFS_HAS_FEAT(zoned, ZONED)
432+
__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
431433

432434
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
433435
{

fs/xfs/xfs_super.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ enum {
111111
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
112112
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
113113
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
114+
Opt_lifetime, Opt_nolifetime,
114115
};
115116

116117
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -156,6 +157,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
156157
fsparam_flag("dax", Opt_dax),
157158
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
158159
fsparam_u32("max_open_zones", Opt_max_open_zones),
160+
fsparam_flag("lifetime", Opt_lifetime),
161+
fsparam_flag("nolifetime", Opt_nolifetime),
159162
{}
160163
};
161164

@@ -184,6 +187,7 @@ xfs_fs_show_options(
184187
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
185188
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
186189
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
190+
{ XFS_FEAT_NOLIFETIME, ",nolifetime" },
187191
{ 0, NULL }
188192
};
189193
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -1091,6 +1095,11 @@ xfs_finish_flags(
10911095
"max_open_zones mount option only supported on zoned file systems.");
10921096
return -EINVAL;
10931097
}
1098+
if (mp->m_features & XFS_FEAT_NOLIFETIME) {
1099+
xfs_warn(mp,
1100+
"nolifetime mount option only supported on zoned file systems.");
1101+
return -EINVAL;
1102+
}
10941103
}
10951104

10961105
return 0;
@@ -1478,6 +1487,12 @@ xfs_fs_parse_param(
14781487
case Opt_max_open_zones:
14791488
parsing_mp->m_max_open_zones = result.uint_32;
14801489
return 0;
1490+
case Opt_lifetime:
1491+
parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
1492+
return 0;
1493+
case Opt_nolifetime:
1494+
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
1495+
return 0;
14811496
default:
14821497
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
14831498
return -EINVAL;

fs/xfs/xfs_zone_alloc.c

Lines changed: 114 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ static struct xfs_open_zone *
424424
xfs_init_open_zone(
425425
struct xfs_rtgroup *rtg,
426426
xfs_rgblock_t write_pointer,
427+
enum rw_hint write_hint,
427428
bool is_gc)
428429
{
429430
struct xfs_open_zone *oz;
@@ -434,6 +435,7 @@ xfs_init_open_zone(
434435
oz->oz_rtg = rtg;
435436
oz->oz_write_pointer = write_pointer;
436437
oz->oz_written = write_pointer;
438+
oz->oz_write_hint = write_hint;
437439
oz->oz_is_gc = is_gc;
438440

439441
/*
@@ -453,6 +455,7 @@ xfs_init_open_zone(
453455
struct xfs_open_zone *
454456
xfs_open_zone(
455457
struct xfs_mount *mp,
458+
enum rw_hint write_hint,
456459
bool is_gc)
457460
{
458461
struct xfs_zone_info *zi = mp->m_zone_info;
@@ -465,12 +468,13 @@ xfs_open_zone(
465468
return NULL;
466469

467470
set_current_state(TASK_RUNNING);
468-
return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
471+
return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
469472
}
470473

471474
static struct xfs_open_zone *
472475
xfs_try_open_zone(
473-
struct xfs_mount *mp)
476+
struct xfs_mount *mp,
477+
enum rw_hint write_hint)
474478
{
475479
struct xfs_zone_info *zi = mp->m_zone_info;
476480
struct xfs_open_zone *oz;
@@ -487,7 +491,7 @@ xfs_try_open_zone(
487491
*/
488492
zi->zi_nr_open_zones++;
489493
spin_unlock(&zi->zi_open_zones_lock);
490-
oz = xfs_open_zone(mp, false);
494+
oz = xfs_open_zone(mp, write_hint, false);
491495
spin_lock(&zi->zi_open_zones_lock);
492496
if (!oz) {
493497
zi->zi_nr_open_zones--;
@@ -510,16 +514,78 @@ xfs_try_open_zone(
510514
return oz;
511515
}
512516

517+
/*
518+
* For data with short or medium lifetime, try to colocated it into an
519+
* already open zone with a matching temperature.
520+
*/
521+
static bool
522+
xfs_colocate_eagerly(
523+
enum rw_hint file_hint)
524+
{
525+
switch (file_hint) {
526+
case WRITE_LIFE_MEDIUM:
527+
case WRITE_LIFE_SHORT:
528+
case WRITE_LIFE_NONE:
529+
return true;
530+
default:
531+
return false;
532+
}
533+
}
534+
535+
static bool
536+
xfs_good_hint_match(
537+
struct xfs_open_zone *oz,
538+
enum rw_hint file_hint)
539+
{
540+
switch (oz->oz_write_hint) {
541+
case WRITE_LIFE_LONG:
542+
case WRITE_LIFE_EXTREME:
543+
/* colocate long and extreme */
544+
if (file_hint == WRITE_LIFE_LONG ||
545+
file_hint == WRITE_LIFE_EXTREME)
546+
return true;
547+
break;
548+
case WRITE_LIFE_MEDIUM:
549+
/* colocate medium with medium */
550+
if (file_hint == WRITE_LIFE_MEDIUM)
551+
return true;
552+
break;
553+
case WRITE_LIFE_SHORT:
554+
case WRITE_LIFE_NONE:
555+
case WRITE_LIFE_NOT_SET:
556+
/* colocate short and none */
557+
if (file_hint <= WRITE_LIFE_SHORT)
558+
return true;
559+
break;
560+
}
561+
return false;
562+
}
563+
513564
static bool
514565
xfs_try_use_zone(
515566
struct xfs_zone_info *zi,
516-
struct xfs_open_zone *oz)
567+
enum rw_hint file_hint,
568+
struct xfs_open_zone *oz,
569+
bool lowspace)
517570
{
518571
if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
519572
return false;
573+
if (!lowspace && !xfs_good_hint_match(oz, file_hint))
574+
return false;
520575
if (!atomic_inc_not_zero(&oz->oz_ref))
521576
return false;
522577

578+
/*
579+
* If we have a hint set for the data, use that for the zone even if
580+
* some data was written already without any hint set, but don't change
581+
* the temperature after that as that would make little sense without
582+
* tracking per-temperature class written block counts, which is
583+
* probably overkill anyway.
584+
*/
585+
if (file_hint != WRITE_LIFE_NOT_SET &&
586+
oz->oz_write_hint == WRITE_LIFE_NOT_SET)
587+
oz->oz_write_hint = file_hint;
588+
523589
/*
524590
* If we couldn't match by inode or life time we just pick the first
525591
* zone with enough space above. For that we want the least busy zone
@@ -534,14 +600,16 @@ xfs_try_use_zone(
534600

535601
static struct xfs_open_zone *
536602
xfs_select_open_zone_lru(
537-
struct xfs_zone_info *zi)
603+
struct xfs_zone_info *zi,
604+
enum rw_hint file_hint,
605+
bool lowspace)
538606
{
539607
struct xfs_open_zone *oz;
540608

541609
lockdep_assert_held(&zi->zi_open_zones_lock);
542610

543611
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
544-
if (xfs_try_use_zone(zi, oz))
612+
if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
545613
return oz;
546614

547615
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -550,20 +618,28 @@ xfs_select_open_zone_lru(
550618

551619
static struct xfs_open_zone *
552620
xfs_select_open_zone_mru(
553-
struct xfs_zone_info *zi)
621+
struct xfs_zone_info *zi,
622+
enum rw_hint file_hint)
554623
{
555624
struct xfs_open_zone *oz;
556625

557626
lockdep_assert_held(&zi->zi_open_zones_lock);
558627

559628
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
560-
if (xfs_try_use_zone(zi, oz))
629+
if (xfs_try_use_zone(zi, file_hint, oz, false))
561630
return oz;
562631

563632
cond_resched_lock(&zi->zi_open_zones_lock);
564633
return NULL;
565634
}
566635

636+
static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
637+
{
638+
if (xfs_has_nolifetime(ip->i_mount))
639+
return WRITE_LIFE_NOT_SET;
640+
return VFS_I(ip)->i_write_hint;
641+
}
642+
567643
/*
568644
* Try to pack inodes that are written back after they were closed tight instead
569645
* of trying to open new zones for them or spread them to the least recently
@@ -587,6 +663,7 @@ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
587663
static struct xfs_open_zone *
588664
xfs_select_zone_nowait(
589665
struct xfs_mount *mp,
666+
enum rw_hint write_hint,
590667
bool pack_tight)
591668
{
592669
struct xfs_zone_info *zi = mp->m_zone_info;
@@ -595,20 +672,38 @@ xfs_select_zone_nowait(
595672
if (xfs_is_shutdown(mp))
596673
return NULL;
597674

675+
/*
676+
* Try to fill up open zones with matching temperature if available. It
677+
* is better to try to co-locate data when this is favorable, so we can
678+
* activate empty zones when it is statistically better to separate
679+
* data.
680+
*/
598681
spin_lock(&zi->zi_open_zones_lock);
599-
if (pack_tight)
600-
oz = xfs_select_open_zone_mru(zi);
682+
if (xfs_colocate_eagerly(write_hint))
683+
oz = xfs_select_open_zone_lru(zi, write_hint, false);
684+
else if (pack_tight)
685+
oz = xfs_select_open_zone_mru(zi, write_hint);
601686
if (oz)
602687
goto out_unlock;
603688

604689
/*
605690
* See if we can open a new zone and use that.
606691
*/
607-
oz = xfs_try_open_zone(mp);
692+
oz = xfs_try_open_zone(mp, write_hint);
608693
if (oz)
609694
goto out_unlock;
610695

611-
oz = xfs_select_open_zone_lru(zi);
696+
/*
697+
* Try to colocate cold data with other cold data if we failed to open a
698+
* new zone for it.
699+
*/
700+
if (write_hint != WRITE_LIFE_NOT_SET &&
701+
!xfs_colocate_eagerly(write_hint))
702+
oz = xfs_select_open_zone_lru(zi, write_hint, false);
703+
if (!oz)
704+
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
705+
if (!oz)
706+
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
612707
out_unlock:
613708
spin_unlock(&zi->zi_open_zones_lock);
614709
return oz;
@@ -617,19 +712,20 @@ xfs_select_zone_nowait(
617712
static struct xfs_open_zone *
618713
xfs_select_zone(
619714
struct xfs_mount *mp,
715+
enum rw_hint write_hint,
620716
bool pack_tight)
621717
{
622718
struct xfs_zone_info *zi = mp->m_zone_info;
623719
DEFINE_WAIT (wait);
624720
struct xfs_open_zone *oz;
625721

626-
oz = xfs_select_zone_nowait(mp, pack_tight);
722+
oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
627723
if (oz)
628724
return oz;
629725

630726
for (;;) {
631727
prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
632-
oz = xfs_select_zone_nowait(mp, pack_tight);
728+
oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
633729
if (oz)
634730
break;
635731
schedule();
@@ -707,6 +803,7 @@ xfs_zone_alloc_and_submit(
707803
{
708804
struct xfs_inode *ip = XFS_I(ioend->io_inode);
709805
struct xfs_mount *mp = ip->i_mount;
806+
enum rw_hint write_hint = xfs_inode_write_hint(ip);
710807
bool pack_tight = xfs_zoned_pack_tight(ip);
711808
unsigned int alloc_len;
712809
struct iomap_ioend *split;
@@ -724,7 +821,7 @@ xfs_zone_alloc_and_submit(
724821
*oz = xfs_last_used_zone(ioend);
725822
if (!*oz) {
726823
select_zone:
727-
*oz = xfs_select_zone(mp, pack_tight);
824+
*oz = xfs_select_zone(mp, write_hint, pack_tight);
728825
if (!*oz)
729826
goto out_error;
730827
}
@@ -862,7 +959,8 @@ xfs_init_zone(
862959
struct xfs_open_zone *oz;
863960

864961
atomic_inc(&rtg_group(rtg)->xg_active_ref);
865-
oz = xfs_init_open_zone(rtg, write_pointer, false);
962+
oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
963+
false);
866964
list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
867965
zi->zi_nr_open_zones++;
868966

fs/xfs/xfs_zone_gc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@ xfs_zone_gc_select_target(
547547

548548
ASSERT(zi->zi_nr_open_zones <=
549549
mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
550-
oz = xfs_open_zone(mp, true);
550+
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
551551
if (oz)
552552
trace_xfs_zone_gc_target_opened(oz->oz_rtg);
553553
spin_lock(&zi->zi_open_zones_lock);
@@ -1117,7 +1117,7 @@ xfs_zone_gc_mount(
11171117
zi->zi_nr_open_zones == mp->m_max_open_zones)
11181118
oz = xfs_zone_gc_steal_open(zi);
11191119
else
1120-
oz = xfs_open_zone(mp, true);
1120+
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
11211121
if (!oz) {
11221122
xfs_warn(mp, "unable to allocate a zone for gc");
11231123
error = -EIO;

fs/xfs/xfs_zone_priv.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ struct xfs_open_zone {
2626
*/
2727
xfs_rgblock_t oz_written;
2828

29+
/*
30+
* Write hint (data temperature) assigned to this zone, or
31+
* WRITE_LIFE_NOT_SET if none was set.
32+
*/
33+
enum rw_hint oz_write_hint;
34+
2935
/*
3036
* Is this open zone used for garbage collection? There can only be a
3137
* single open GC zone, which is pointed to by zi_open_gc_zone in
@@ -100,7 +106,8 @@ struct xfs_zone_info {
100106

101107
};
102108

103-
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
109+
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
110+
enum rw_hint write_hint, bool is_gc);
104111

105112
int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
106113
bool xfs_zoned_need_gc(struct xfs_mount *mp);

0 commit comments

Comments
 (0)