Implement RFC 3943: MIR move elimination#157943
Conversation
|
|
This comment has been minimized.
This comment has been minimized.
f12157e to
ca2c2ac
Compare
| tcx: TyCtxt<'tcx>, | ||
| body: &mir::Body<'tcx>, | ||
| pass_name: Option<&'static str>, | ||
| ) -> (Vec<(Local, Location)>, IndexVec<BasicBlock, DenseBitSet<Local>>) { |
There was a problem hiding this comment.
BitMatrix<BasicBlock, Local>?
There was a problem hiding this comment.
In PreciseLiveness::apply_block_start_effect we need to have a DenseBitSet<Local> to call .intersect on, which BitMatrix doesn't expose.
| // Notably this kills any dead results produced by a predecessor's terminator. | ||
| state.intersect(&live_on_entry[block]); | ||
|
|
||
| for local in state.iter() { | ||
| builder.gen_(local, points.entry_point(block), SplitPointEffect::Early); | ||
| } | ||
|
|
||
| for (statement_index, statement) in block_data.statements.iter().enumerate() { | ||
| let location = Location { block, statement_index }; | ||
| let point = points.point_from_location(location); | ||
|
|
||
| // StorageDead always kills a local, even if it has been borrowed. | ||
| if let mir::StatementKind::StorageDead(local) = statement.kind { | ||
| builder.kill(local, point, SplitPointEffect::Late); | ||
| continue; | ||
| } | ||
|
|
||
| // Kill moved operands if the whole local was moved. | ||
| VisitPlacesWith(|place: Place<'tcx>, ctxt| { | ||
| if ctxt == PlaceContext::NonMutatingUse(NonMutatingUseContext::Move) { | ||
| if let Some(local) = place.as_local() { | ||
| builder.kill(local, point, SplitPointEffect::Early); | ||
| } | ||
| } | ||
| }) | ||
| .visit_statement(statement, location); | ||
|
|
||
| // Kill any locals which are no longer used after this statement. | ||
| for &(local, _) in kill_point_map[point] { | ||
| builder.kill(local, point, SplitPointEffect::Early); | ||
| } | ||
|
|
||
| // Gen destination places. | ||
| VisitPlacesWith(|place: Place<'tcx>, ctxt| match DefUse::for_place(place, ctxt) { | ||
| DefUse::Def | DefUse::PartialWrite => { | ||
| builder.gen_(place.local, point, SplitPointEffect::Late) | ||
| } | ||
| DefUse::Use | DefUse::NonUse => {} | ||
| }) | ||
| .visit_statement(statement, location); | ||
|
|
||
| // Kill any dead destination places: they will only appear at | ||
| // the late point of the statement they are generated in, which is | ||
| // sufficient for determining overlap. | ||
| for &(local, _) in kill_point_map[point] { | ||
| builder.kill(local, point, SplitPointEffect::Late); | ||
| } | ||
| } | ||
|
|
||
| let location = Location { block, statement_index: block_data.statements.len() }; | ||
| let point = points.point_from_location(location); | ||
| let terminator = block_data.terminator(); | ||
|
|
||
| // Kill moved operands if the whole local was moved. Also kill dropped | ||
| // places if the entire local was dropped. | ||
| VisitPlacesWith(|place: Place<'tcx>, ctxt| { | ||
| if let PlaceContext::NonMutatingUse(NonMutatingUseContext::Move) | ||
| | PlaceContext::MutatingUse(MutatingUseContext::Drop) = ctxt | ||
| { | ||
| if let Some(local) = place.as_local() { | ||
| builder.kill(local, point, SplitPointEffect::Early); | ||
| } | ||
| } | ||
| }) | ||
| .visit_terminator(terminator, location); | ||
|
|
||
| // Kill any locals which are no longer used after this terminator. | ||
| for &(local, _) in kill_point_map[point] { | ||
| builder.kill(local, point, SplitPointEffect::Early); | ||
| } | ||
|
|
||
| // Gen destination places. | ||
| VisitPlacesWith(|place: Place<'tcx>, ctxt| match DefUse::for_place(place, ctxt) { | ||
| DefUse::Def | DefUse::PartialWrite => { | ||
| builder.gen_(place.local, point, SplitPointEffect::Late) | ||
| } | ||
| DefUse::Use | DefUse::NonUse => {} | ||
| }) | ||
| .visit_terminator(terminator, location); | ||
|
|
||
| // Move arguments to a call are treated specially: the place that they | ||
| // represent is passed directly to the callee, which means that they are | ||
| // not allowed to alias any other move operand or the destination place. | ||
| // This is represented here by extending their live range to the late | ||
| // part, making it overlap with that of the destination place. | ||
| // | ||
| // Notably, this *doesn't* apply to TailCall. | ||
| if let mir::TerminatorKind::Call { | ||
| func: _, | ||
| args, | ||
| destination: _, | ||
| target: _, | ||
| unwind: _, | ||
| call_source: _, | ||
| fn_span: _, | ||
| } = &terminator.kind | ||
| { | ||
| for arg in args { | ||
| if let mir::Operand::Move(place) = arg.node { | ||
| builder.gen_(place.local, point, SplitPointEffect::Late); | ||
| builder.kill(place.local, point, SplitPointEffect::Late); | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
This duplicates a lot of code in impl Analysis for PreciseLiveness. Any way to deduplicate?
There was a problem hiding this comment.
Not easily: the two are sufficiently different that I'm not sure the code would be clearer if we tried to deduplicate. The PreciseLiveness analysis is simpler since it only needs to determine the liveness at the end of the statement. On the other hand matrix construction needs to determine the liveness both at the mid point of the instruction and after it, and record both in the matrix.
| trace!("cannot unify {a:?} and {b:?} involving a rust-call tuple argument"); | ||
| return None; | ||
| } | ||
|
|
There was a problem hiding this comment.
Can we remove a common projection tail from a and b before this match? Allowing to merge _1.foo.bar with _2.blah.foo.bar?
There was a problem hiding this comment.
We could, but that's not really useful in practice if you consider the kind of patterns that trigger this optimization. Specifically, in order for the lifetimes to not overlap the RHS of an assignment needs to end its lifetime at the assignment. This only happens for a move of a whole local (no projections) or for non-borrowed locals where this is its last use.
| fn visit_aggregate_assign( | ||
| &mut self, | ||
| dest: Place<'tcx>, | ||
| project_field: impl Fn(TyCtxt<'tcx>, Place<'tcx>, FieldIdx, Ty<'tcx>) -> Place<'tcx>, |
There was a problem hiding this comment.
Should the callback return a &[PlaceElem<'tcx>] and call project_deeper here?
There was a problem hiding this comment.
Returning a slice is awkward because there's no good lifetime to give it.
| // Under the local lifetime semantics from RFC 3943, `StorageLive` does not allocate, | ||
| // and `StorageDead` has no effect if the local was already freed by a move. These | ||
| // markers therefore do not affect whether a copy can be treated as a final use. | ||
| StatementKind::StorageLive(_) | StatementKind::StorageDead(_) | StatementKind::Nop => {} |
There was a problem hiding this comment.
Why can't StorageDead(l) perform a used_after.remove(l)?
There was a problem hiding this comment.
This is a backwards analysis where used_after tracks any local that is used later on the current path. We only ever add to used_after as we walk backwards through the function, and never remove from it.
This optimization is meant to supersede Separately, I think there may be value in having the coroutine pass re-use |
- Changed `iter_intervals` to return `RangeInclusive` instead of `Range` - Added `clear_row`, `disjoint_rows` and `intersects_range` methods
6724dd7 to
974d4c0
Compare
|
This PR was rebased onto a different main commit. Here's a range-diff highlighting what actually changed. Rebasing is a normal part of keeping PRs up to date, so no action is needed—this note is just to help reviewers. |
This flag also has the effect of disabling DestinationPropagation, which is already covered by move elimination.
974d4c0 to
34776e8
Compare
This comment has been minimized.
This comment has been minimized.
34776e8 to
454a459
Compare
| VisitPlacesWith(|place: Place<'tcx>, ctxt| { | ||
| if let PlaceContext::NonMutatingUse(NonMutatingUseContext::Move) = ctxt { | ||
| if let Some(local) = place.as_local() { | ||
| builder.kill(local, point, SplitPointEffect::Early); |
There was a problem hiding this comment.
Question: Should we just treat the call arguments as live and ignore gen and kill for them? I don't see how that helps with move elimination, though I know it makes the analysis a bit less precise.
There was a problem hiding this comment.
Are you talking about argument locals (_1, _2) that come function arguments? We do want to track lifetimes for those because we can merge them with other locals if their lifetimes don't overlap.
There was a problem hiding this comment.
Yes. Here is an example: f98897f which skips gen&kill for arguments. I don't see tests that are broken. I cannot imagine that a case can use the move fact. Or can we only kill in the late phase?
There was a problem hiding this comment.
We still need to kill moved arguments after the call. It's just that they are killed at the late point instead of the early point to model the fact that they are not allowed to overlap with the return place.
There was a problem hiding this comment.
I think it's fine to skip kills, which is an over-approximation, still has the fact. IIUC, the current code is:
// Early: Kill(_1)
_2 = Call(move _1) -> BB1, BB2
// Late: Gen(_1, _2), Kill(_1)
I mean change this to:
// Early: -
_2 = Call(move _1) -> BB1, BB2
// Late: Gen(_2)
Or can we change to:
// Early: -
_2 = Call(move _1) -> BB1, BB2
// Late: Gen(_2), Kill(_1)
or
// Early: -
_2 = Call(move _1) -> BB1, BB2
// Late: Gen(_2)
BB1:
Kill(_1)
BB2:
Kill(_1)
?
| if place.is_indirect_first_projection() { | ||
| return ControlFlow::Break(()); | ||
| } |
There was a problem hiding this comment.
Could you add a test case for this code path?
| // These permit either cannot have aliasing, or allow it because | ||
| // they only operate on scalar backend types. |
There was a problem hiding this comment.
stray word: permit
| // Accessing an indirect place may touch any borrowed local, so | ||
| // continuing would require treating all borrowed locals as used | ||
| // after this point. | ||
| if place.is_indirect_first_projection() { | ||
| return ControlFlow::Break(()); | ||
| } | ||
|
|
||
| // Writing to a borrowed local can start a new allocation range. | ||
| // Shortening an earlier borrowed local could remove an overlap | ||
| // with that new range. | ||
| if borrowed.contains(place.local) { | ||
| return ControlFlow::Break(()); | ||
| } | ||
|
|
||
| // `SetDiscriminant` has a validity invariant on the rest of the | ||
| // place, so treat the base local as accessed along with any | ||
| // projection locals. | ||
| record_place_locals(**place, used_after); |
There was a problem hiding this comment.
Could you add test cases for those conditions and call to record_place_locals (or alternatively share the implementation and tests with StatementKind::Assign)?
| // Indirect places don't overlap because we assume they didn't overlap in | ||
| // the input MIR. | ||
| if a.local != b.local || a.is_indirect_first_projection() || b.is_indirect_first_projection() { | ||
| return false; | ||
| } |
There was a problem hiding this comment.
Why do we know that there are no overlaps involving indirect places after transformation (borrowed locals do participate in the unification after all)?
There was a problem hiding this comment.
The reasoning is actually quite complex so I'll add a code comment explaining this. Essentially:
- An indirect destination place cannot start a lifetime so the lifetime must have started earlier, which overlaps the early point of the assignment statement. Therefore we couldn't have unified any source operand with this destination place.
- Similarly, an indirect source cannot end a lifetime and the same logic applies.
However there is a possible shape that could result in an indirect source overlapping a destination:
_2 = &_1
_3 = (copy *_2, move _1)
Looking at the code, this doesn't seem to be properly handled at the moment. It never came up because we never generate this kind of MIR. For the fix, I think we can just forcibly isolate any indirect source if any direct source overlaps the destination.
| /// Don't insert storage statements in cleanup blocks and in unreachable blocks. | ||
| fn should_insert_storage<'tcx>(block_data: &BasicBlockData<'tcx>) -> bool { | ||
| !block_data.is_cleanup && !matches!(block_data.terminator().kind, TerminatorKind::Unreachable) | ||
| } |
There was a problem hiding this comment.
It would be valuable to describe the motivation behind the decision to skip the storage statements there.
That being said, it seems incorrect in this form. Suppose there are two disjoint live ranges: one in a non-cleanup block and another in a cleanup block. We would insert storage statements for the first range, but not the second. Introducing UB for any use of a local in the second live range.
There was a problem hiding this comment.
You are right that it is incorrect for arbitrary MIR. Omitting StorageDead is not a problem, it just causes storage to live longer. The problem is omitting StorageLive.
It works fine for the MIR that we generate because our cleanup blocks only contain drop terminators and SwitchInt checks on drop flags. We never start the lifetime of any local in cleanup blocks.
Unfortunately, this is what happens if we don't skip inserting StorageLive in cleanup blocks. This happens because, upstream of us, MIR building doesn't emit StorageDead in cleanup blocks (#149435, #147875). If a local is ever borrowed, then the liveness analysis will show it as being live in all cleanup blocks all the way to the UnwindResume terminator. With proper StorageDead from MIR building this lifetime would end in the cleanup block at the end of its scope instead.
The end result is that we end up emitting O(N^2) StorageLive statements, specifically at points in the cleanup tree where control flow goes from an inner scope cleanup to an outer scope cleanup. The storage reconstruction algorithm inserts StorageLive at transitions from Dead->MaybeLive, which is the case for a local being live in an outer scope clean but not in an inner scope cleanup.
I'm not entirely sure what the best way to resolve this is. I'm OK with the current hack for now, at least until MIR building is fixed (but that requires an edition bump because it breaks borrowck).
There was a problem hiding this comment.
Do I understand correctly that we would be inserting StorageLive into cleanup blocks only because it is live in some predecessors and dead in others, and locals are never actually allocated in cleanup blocks?
In this specific case we could check whether a local is ever allocated in any cleanup block, right?
In general, it might be desirable to avoid inserting such useless storage statements, by identifying whether particular location might be followed by an allocation of a local.
There was a problem hiding this comment.
Right, basically given this MIR, here's where we current insert StorageLive:
let mut _1: (u32, 32);
bb0: {
switchInt(..) -> [bb1, bb2];
}
bb1: {
StorageLive(_1);
_1.0 = 1;
goto -> bb3;
}
bb2: {
StorageLive(_1);
goto -> bb3;
}
bb3: {
_1.1 = 1;
StorageDead(_1);
return;
}
There are 3 states that a local can be in for the purpose of the lifetime analysis: definitely-live, maybe-live and dead. In this example, we have the following transitions:
- _1 is entirely dead in bb0 and bb2.
- In bb1, _1 goes from dead to definitely-live.
- In bb3, _1 is maybe-live because it has different liveness states from predecessors.
We currently insert StorageLive at transition points from dead to live/maybe-live.
If we had a version of StorageLive that didn't clobber the contents of the local then we could just insert StorageLive at any transitions to definitely-live and avoid the problem entirely:
let mut _1: (u32, 32);
bb0: {
switchInt(..) -> [bb1, bb2];
}
bb1: {
StorageLive(_1);
_1.0 = 1;
goto -> bb3;
}
bb2: {
goto -> bb3;
}
bb3: {
StorageLive(_1); // This relies on StorageLive not clobbering _1
_1.1 = 1;
StorageDead(_1);
return;
}
Now what is happening in cleanup blocks is effectively what you'd get here if you removed the assignment to _1.1 in bb3. Now the StorageLive in bb2 becomes unnecessary, but there's no way we can know this from just looking at the set of live ranges produced by PreciseLiveness (which just indicate the ranges where a local is maybe-live).
So the rule for proper StorageLive placement would be something like:
- Always insert at dead -> definitely-live transitions. Such transitions only happens at assignments when the local is dead before the assignment.
- Insert at dead -> maybe-live transitions (which can only happen on block edges) only if later on that path there is an assignment which (partially) initializes that local.
I think this might be doable with another backward dataflow pass that tracks future assignments to a local, but I'll need to think about it more carefully.
There was a problem hiding this comment.
It works fine for the MIR that we generate because our cleanup blocks only contain drop terminators and SwitchInt checks on drop flags. We never start the lifetime of any local in cleanup blocks.
BTW, the conclusion doesn't follow since precise liveness can improve existing storage ranges, introducing live ranges that do start in cleanup blocks. For example, it does miscompile the following example by using _4 when it is dead in block bb4:
struct MaybePanic(bool);
impl MaybePanic {
fn new(panic: bool) -> MaybePanic {
MaybePanic(panic)
}
}
impl Drop for MaybePanic {
fn drop(&mut self) {
if self.0 {
panic!();
}
}
}
pub fn main() {
let mut a;
a = MaybePanic::new(true);
a = MaybePanic::new(false);
}rustc +stage1 b.rs -O -Zinline-mir=no -Zmir-move-elimination -Zdump-mir=all
--- b.main.3-2-030.MoveElimination.before.mir 2026-07-02 21:29:08.301724379 +0200
+++ b.main.3-2-030.MoveElimination.after.mir 2026-07-02 21:29:08.301912013 +0200
@@ -1,60 +1,68 @@
-// MIR for `main` before MoveElimination
+// MIR for `main` after MoveElimination
fn main() -> () {
let mut _0: ();
let mut _1: MaybePanic;
let mut _2: MaybePanic;
let mut _3: MaybePanic;
let mut _4: bool;
scope 1 {
debug a => _1;
}
bb0: {
+ StorageLive(_4);
_4 = const false;
+ nop;
+ nop;
StorageLive(_1);
- StorageLive(_2);
- _2 = MaybePanic::new(const true) -> [return: bb1, unwind: bb8];
+ _1 = MaybePanic::new(const true) -> [return: bb1, unwind: bb8];
}
bb1: {
+ StorageDead(_4);
+ StorageLive(_4);
_4 = const true;
- _1 = move _2;
- StorageDead(_2);
+ nop;
+ nop;
+ nop;
StorageLive(_3);
_3 = MaybePanic::new(const false) -> [return: bb2, unwind: bb8];
}
bb2: {
+ StorageDead(_4);
drop(_1) -> [return: bb3, unwind: bb4];
}
bb3: {
_1 = move _3;
StorageDead(_3);
+ nop;
drop(_1) -> [return: bb5, unwind continue];
}
bb4 (cleanup): {
_4 = const true;
_1 = move _3;
goto -> bb8;
}
bb5: {
+ nop;
StorageDead(_1);
return;
}
bb6 (cleanup): {
resume;
}
bb7 (cleanup): {
drop(_1) -> [return: bb6, unwind terminate(cleanup)];
}
bb8 (cleanup): {
switchInt(copy _4) -> [0: bb6, otherwise: bb7];
}
}| } | ||
|
|
||
| /// Re-constructs storage statements for all locals. | ||
| fn reconstruct_storage<'tcx>( |
There was a problem hiding this comment.
A few quality-of-implementation observations on storage statements reconstruction (no need to address any of them here):
- A storage range can start without being ended before return, e.g., when a local without any storage statements is borrowed.
emit_storage_live_in_predsinserts duplicatedStorageLivewhen there are multiple edges between a block and its predecessor.emit_storage_dead_in_succsinserts duplicatedStorageDeadwhen there are multiple edges between a block and its successor.- There are incidental overlaps between storage ranges of distinct locals when inserting statements at the same location (we would like to order
StorageDeadbeforeStorageLive, while it happens to be the order of calls toMirPatch::add_statement).
There was a problem hiding this comment.
- We could fix this in the lifetime analysis by having forcibly ending the lifetime of all locals before the return terminator. But it feels like this would just bloat IR for no benefit since the return terminator implicitly ends all storage anyways.
- This is a known issue, but duplicate StorageLive are idempotent and I judged it was not worth the effort to try deduplicating them.
- Same thing.
- We could fix this, but it would have little benefit in practice. LLVM currently ignores StorageLive anyways and uses the first point where an alloca is used as an implicit StorageLive instead.
There was a problem hiding this comment.
Re 1) It is potentially useful to avoid extending storage lifetime after inlining (neither MIR inliner nor LLVM inliner attempts to turn those implicit ends into explicit ones if there are storage markers / lifetimes already).
Re 4) LLVM does indeed use first use instead of lifetime.start, but only when there is a single pair of lifetime.start and lifetime.end and all direct uses are in-between. Besides LLVM approach is unsound, because it makes no attempt to avoid introducing invalid observable overlaps.
|
I threw rustlantis at this PR and it found that this program: #![feature(custom_mir, core_intrinsics, lazy_get)]
extern crate core;
use core::intrinsics::mir::*;
fn dump_var<T>(val: T) { }
#[custom_mir(dialect = "runtime", phase = "initial")]
pub fn fn1() {
mir!{
let _1: *const bool;
let _2: bool;
let _3: bool;
{
_1 = core::ptr::addr_of!(_3);
*_1 = true;
_2 = _3;
Call(RET = dump_var(Move(_2)), ReturnTo(bb1), UnwindUnreachable())
}
bb1 = {
Call(RET = dump_var(_3), ReturnTo(bb2), UnwindUnreachable())
}
bb2 = {
Return()
}
}
}
pub fn main() {
fn1();
}gets this diff applied: -// MIR for `fn1` before MoveElimination
+// MIR for `fn1` after MoveElimination
fn fn1() -> () {
let mut _0: ();
@@ -7,14 +7,14 @@ fn fn1() -> () {
let mut _3: bool;
bb0: {
- _1 = &raw const _3;
+ _1 = &raw const _2;
(*_1) = const true;
- _2 = copy _3;
+ nop;
_0 = dump_var::<bool>(move _2) -> [return: bb1, unwind unreachable];
}
bb1: {
- _0 = dump_var::<bool>(copy _3) -> [return: bb2, unwind unreachable];
+ _0 = dump_var::<bool>(copy _2) -> [return: bb2, unwind unreachable];
}
bb2: {It seems like the current implementation introduces use of a place after it is moved out of? |
|
That's strange, it should have refused to merge _2 and _3 because their live ranges overlap. I'll look into it. |
|
|
|
The original program has no storage statement which means all locals are implicitly live at the beginning. |
|
Not with the new MIR semantics. The local is only allocated when it is first written to. |
|
Ah... then Rustlantis will need to be patched. |
View all comments
This PR implements rust-lang/rfcs#3943: a MIR optimization pass that eliminates unnecessary copies. Since the new pass relies on the new MIR semantics from RFC 3943, it is gated behind the the
-Z mir-move-eliminationflag. Enabling this flag also disables the DestinationPropagation pass, which is completely superseded by this one.There are 3 main parts to this optimization:
The RFC text and the top-level comments for the various passes have more details on the internals of the optimization.
r? @dianqk
cc @rust-lang/wg-mir-opt