@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
 	return err ? err : IRQ_SET_MASK_OK;
 }
 
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+	unsigned int vector = apicd->prev_vector;
+	unsigned int cpu = apicd->prev_cpu;
+	bool managed = apicd->is_managed;
+
+	/*
+	 * Managed interrupts are usually not migrated away
+	 * from an online CPU, but CPU isolation 'managed_irq'
+	 * can make that happen.
+	 * 1) Activation does not take the isolation into account
+	 *    to keep the code simple
+	 * 2) Migration away from an isolated CPU can happen when
+	 *    a non-isolated CPU which is in the calculated
+	 *    affinity mask comes online.
+	 */
+	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+	irq_matrix_free(vector_matrix, cpu, vector, managed);
+	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+	hlist_del_init(&apicd->clist);
+	apicd->prev_vector = 0;
+	apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+	unsigned int cpu = smp_processor_id();
+	struct apic_chip_data *apicd;
+	unsigned int vector;
+
+	guard(raw_spinlock)(&vector_lock);
+	apicd = apic_chip_data(irqd);
+	if (!apicd)
+		return;
+
+	/*
+	 * If prev_vector is empty or the descriptor is neither currently
+	 * nor previously on the outgoing CPU no action required.
+	 */
+	vector = apicd->prev_vector;
+	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+		return;
+
+	/*
+	 * This is tricky. If the cleanup of the old vector has not been
+	 * done yet, then the following setaffinity call will fail with
+	 * -EBUSY. This can leave the interrupt in a stale state.
+	 *
+	 * All CPUs are stuck in stop machine with interrupts disabled so
+	 * calling __irq_complete_move() would be completely pointless.
+	 *
+	 * 1) The interrupt is in move_in_progress state. That means that we
+	 *    have not seen an interrupt since the io_apic was reprogrammed to
+	 *    the new vector.
+	 *
+	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+	 *    have not been processed yet.
+	 */
+	if (apicd->move_in_progress) {
+		/*
+		 * In theory there is a race:
+		 *
+		 * set_ioapic(new_vector) <-- Interrupt is raised before update
+		 *                            is effective, i.e. it's raised on
+		 *                            the old vector.
+		 *
+		 * So if the target cpu cannot handle that interrupt before
+		 * the old vector is cleaned up, we get a spurious interrupt
+		 * and in the worst case the ioapic irq line becomes stale.
+		 *
+		 * But in case of cpu hotplug this should be a non issue
+		 * because if the affinity update happens right before all
+		 * cpus rendezvous in stop machine, there is no way that the
+		 * interrupt can be blocked on the target cpu because all cpus
+		 * loops first with interrupts enabled in stop machine, so the
+		 * old vector is not yet cleaned up when the interrupt fires.
+		 *
+		 * So the only way to run into this issue is if the delivery
+		 * of the interrupt on the apic/system bus would be delayed
+		 * beyond the point where the target cpu disables interrupts
+		 * in stop machine. I doubt that it can happen, but at least
+		 * there is a theoretical chance. Virtualization might be
+		 * able to expose this, but AFAICT the IOAPIC emulation is not
+		 * as stupid as the real hardware.
+		 *
+		 * Anyway, there is nothing we can do about that at this point
+		 * w/o refactoring the whole fixup_irq() business completely.
+		 * We print at least the irq number and the old vector number,
+		 * so we have the necessary information when a problem in that
+		 * area arises.
+		 */
+		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+			irqd->irq, vector);
+	}
+	free_moved_vector(apicd);
+}
+
 #else
-# define apic_set_affinity	NULL
+# define apic_set_affinity		NULL
+# define apic_force_complete_move	NULL
 #endif
 
 static int apic_retrigger_irq(struct irq_data *irqd)
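Note on the locking pattern: the new apic_force_complete_move() uses guard(raw_spinlock)() from <linux/cleanup.h>, which releases the lock automatically when the enclosing scope is left. That is why the added function can simply return early where the removed variant (further down) needed a goto-based unlock path. A minimal sketch of the pattern with hypothetical demo_* names (guard(), DEFINE_RAW_SPINLOCK() and the raw_spinlock guard class are the real kernel APIs; everything else here is illustrative):

#include <linux/spinlock.h>
#include <linux/cleanup.h>
#include <linux/errno.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static unsigned int demo_state;

static int demo_read_state(void)
{
	/*
	 * Acquires demo_lock here; the guard drops it automatically on
	 * every exit from this scope, including early returns.
	 */
	guard(raw_spinlock)(&demo_lock);

	if (!demo_state)
		return -ENODATA;	/* early return still unlocks */
	return demo_state;
}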
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
 }
 
 static struct irq_chip lapic_controller = {
-	.name			= "APIC",
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= apic_set_affinity,
-	.irq_compose_msi_msg	= x86_vector_msi_compose_msg,
-	.irq_retrigger		= apic_retrigger_irq,
+	.name				= "APIC",
+	.irq_ack			= apic_ack_edge,
+	.irq_set_affinity		= apic_set_affinity,
+	.irq_compose_msi_msg		= x86_vector_msi_compose_msg,
+	.irq_force_complete_move	= apic_force_complete_move,
+	.irq_retrigger			= apic_retrigger_irq,
 };
 
 #ifdef CONFIG_SMP
 
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
-	unsigned int vector = apicd->prev_vector;
-	unsigned int cpu = apicd->prev_cpu;
-	bool managed = apicd->is_managed;
-
-	/*
-	 * Managed interrupts are usually not migrated away
-	 * from an online CPU, but CPU isolation 'managed_irq'
-	 * can make that happen.
-	 * 1) Activation does not take the isolation into account
-	 *    to keep the code simple
-	 * 2) Migration away from an isolated CPU can happen when
-	 *    a non-isolated CPU which is in the calculated
-	 *    affinity mask comes online.
-	 */
-	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
-	irq_matrix_free(vector_matrix, cpu, vector, managed);
-	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-	hlist_del_init(&apicd->clist);
-	apicd->prev_vector = 0;
-	apicd->move_in_progress = 0;
-}
-
 static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
 {
 	struct apic_chip_data *apicd;
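The hunk above wires the new handler into struct irq_chip via the .irq_force_complete_move member, replacing the former global irq_force_complete_move(struct irq_desc *) removed below. A hedged sketch of how a core hotplug-fixup path can dispatch to such a callback (the function name fixup_force_complete_move is an assumption for illustration; irq_desc_get_irq_data(), irq_data_get_irq_chip() and the callback member shown in this diff are real):

/* Illustrative dispatch: invoke the chip's callback if it provides one. */
static void fixup_force_complete_move(struct irq_desc *desc)
{
	struct irq_data *irqd = irq_desc_get_irq_data(desc);
	struct irq_chip *chip = irq_data_get_irq_chip(irqd);

	if (chip && chip->irq_force_complete_move)
		chip->irq_force_complete_move(irqd);
}

One consequence of this design: the callback is only reached for interrupts whose chip actually registered it, so the removed code's defensive check that the chip_data really is apic_chip_data becomes unnecessary.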
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
 	__vector_schedule_cleanup(apicd);
 }
 
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
-	unsigned int cpu = smp_processor_id();
-	struct apic_chip_data *apicd;
-	struct irq_data *irqd;
-	unsigned int vector;
-
-	/*
-	 * The function is called for all descriptors regardless of which
-	 * irqdomain they belong to. For example if an IRQ is provided by
-	 * an irq_chip as part of a GPIO driver, the chip data for that
-	 * descriptor is specific to the irq_chip in question.
-	 *
-	 * Check first that the chip_data is what we expect
-	 * (apic_chip_data) before touching it any further.
-	 */
-	irqd = irq_domain_get_irq_data(x86_vector_domain,
-				       irq_desc_get_irq(desc));
-	if (!irqd)
-		return;
-
-	raw_spin_lock(&vector_lock);
-	apicd = apic_chip_data(irqd);
-	if (!apicd)
-		goto unlock;
-
-	/*
-	 * If prev_vector is empty or the descriptor is neither currently
-	 * nor previously on the outgoing CPU no action required.
-	 */
-	vector = apicd->prev_vector;
-	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
-		goto unlock;
-
-	/*
-	 * This is tricky. If the cleanup of the old vector has not been
-	 * done yet, then the following setaffinity call will fail with
-	 * -EBUSY. This can leave the interrupt in a stale state.
-	 *
-	 * All CPUs are stuck in stop machine with interrupts disabled so
-	 * calling __irq_complete_move() would be completely pointless.
-	 *
-	 * 1) The interrupt is in move_in_progress state. That means that we
-	 *    have not seen an interrupt since the io_apic was reprogrammed to
-	 *    the new vector.
-	 *
-	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
-	 *    have not been processed yet.
-	 */
-	if (apicd->move_in_progress) {
-		/*
-		 * In theory there is a race:
-		 *
-		 * set_ioapic(new_vector) <-- Interrupt is raised before update
-		 *                            is effective, i.e. it's raised on
-		 *                            the old vector.
-		 *
-		 * So if the target cpu cannot handle that interrupt before
-		 * the old vector is cleaned up, we get a spurious interrupt
-		 * and in the worst case the ioapic irq line becomes stale.
-		 *
-		 * But in case of cpu hotplug this should be a non issue
-		 * because if the affinity update happens right before all
-		 * cpus rendezvous in stop machine, there is no way that the
-		 * interrupt can be blocked on the target cpu because all cpus
-		 * loops first with interrupts enabled in stop machine, so the
-		 * old vector is not yet cleaned up when the interrupt fires.
-		 *
-		 * So the only way to run into this issue is if the delivery
-		 * of the interrupt on the apic/system bus would be delayed
-		 * beyond the point where the target cpu disables interrupts
-		 * in stop machine. I doubt that it can happen, but at least
-		 * there is a theoretical chance. Virtualization might be
-		 * able to expose this, but AFAICT the IOAPIC emulation is not
-		 * as stupid as the real hardware.
-		 *
-		 * Anyway, there is nothing we can do about that at this point
-		 * w/o refactoring the whole fixup_irq() business completely.
-		 * We print at least the irq number and the old vector number,
-		 * so we have the necessary information when a problem in that
-		 * area arises.
-		 */
-		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
-			irqd->irq, vector);
-	}
-	free_moved_vector(apicd);
-unlock:
-	raw_spin_unlock(&vector_lock);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Note, this is not accurate accounting, but at least good enough to
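Background on the VECTOR_UNUSED store in free_moved_vector(): the per-CPU vector_irq table maps arriving vector numbers to interrupt descriptors on the entry path. A simplified consumer-side sketch, assuming freed slots read back as NULL/ERR_PTR values so a late interrupt on a freed vector is flagged as spurious rather than dispatched to a stale descriptor (handle_one_vector is a hypothetical name; __this_cpu_read(), IS_ERR_OR_NULL() and generic_handle_irq_desc() are real kernel APIs):

/* Sketch only: look up the descriptor for an arriving vector. */
static void handle_one_vector(unsigned int vector)
{
	struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);

	if (!IS_ERR_OR_NULL(desc))
		generic_handle_irq_desc(desc);	/* normal dispatch */
	else
		pr_warn("Spurious interrupt on vector %u\n", vector);	/* slot was freed */
}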