@@ -69,6 +69,15 @@ struct nmi_stats {
 	unsigned int unknown;
 	unsigned int external;
 	unsigned int swallow;
+	unsigned long recv_jiffies;
+	unsigned long idt_seq;
+	unsigned long idt_nmi_seq;
+	unsigned long idt_ignored;
+	atomic_long_t idt_calls;
+	unsigned long idt_seq_snap;
+	unsigned long idt_nmi_seq_snap;
+	unsigned long idt_ignored_snap;
+	long idt_calls_snap;
 };
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
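The counters added above follow an even/odd convention: idt_seq and idt_nmi_seq are
incremented once on entry to and once on exit from the corresponding region of
exc_nmi(), so an odd value means the CPU is currently inside that region, while the
*_snap fields hold copies taken by nmi_backtrace_stall_snap() for later comparison.
A minimal sketch of how these counters can be interpreted (the two helpers below are
purely illustrative and are not part of this patch):

	/* Hypothetical helpers, for illustration only. */
	static bool nmi_stats_in_handler(struct nmi_stats *nsp)
	{
		/* Odd idt_seq: between the entry and exit increments of exc_nmi(). */
		return READ_ONCE(nsp->idt_seq) & 0x1;
	}

	static bool nmi_stats_progress_since_snap(struct nmi_stats *nsp)
	{
		/* Any change since the snapshot means at least one NMI was latched. */
		return READ_ONCE(nsp->idt_seq) != nsp->idt_seq_snap;
	}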
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 DEFINE_IDTENTRY_RAW(exc_nmi)
 {
 	irqentry_state_t irq_state;
+	struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
 
 	/*
 	 * Re-enable NMIs right here when running as an SEV-ES guest. This might
 	 * cause nested NMIs, but those can be handled safely.
 	 */
 	sev_es_nmi_complete();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+		arch_atomic_long_inc(&nsp->idt_calls);
 
 	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
 		return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	}
 	this_cpu_write(nmi_state, NMI_EXECUTING);
 	this_cpu_write(nmi_cr2, read_cr2());
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 nmi_restart:
 
 	/*
@@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	inc_irq_stat(__nmi_count);
 
-	if (!ignore_nmis)
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+		WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+	} else if (!ignore_nmis) {
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+		}
 		default_do_nmi(regs);
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+		}
+	}
 
 	irqentry_nmi_exit(regs, irq_state);
 
@@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	if (user_mode(regs))
 		mds_user_clear_cpu_buffers();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(nsp->idt_seq & 0x1);
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 }
 
 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
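Taken together, the exc_nmi() hunks above give each NMI the following counter
lifecycle (a summary of the increments shown above, not code from the patch):

	/*
	 * idt_calls:    incremented on every entry, even when the CPU is offline
	 * idt_seq:      even -> odd once the NMI is latched; recv_jiffies updated
	 * idt_nmi_seq:  even -> odd just before default_do_nmi(), unless ignore_nmis
	 *               is set, in which case idt_ignored is incremented instead
	 * idt_nmi_seq:  odd -> even just after default_do_nmi() returns
	 * idt_seq:      odd -> even on exit from exc_nmi(); recv_jiffies updated again
	 */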
@@ -537,6 +570,79 @@ DEFINE_IDTENTRY_RAW(exc_nmi_noist)
 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
 #endif
 
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/*                                                                      */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler.           */
+/* | +------ cpu_is_offline(cpu)                                        */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls):   */
+/* | | |       NMI handler has been invoked.                            */
+/* | | |                                                                */
+/* V V V                                                                */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+	int cpu;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+		nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+		nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+		nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+	}
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+	int cpu;
+	int idx;
+	unsigned long nmi_seq;
+	unsigned long j = jiffies;
+	char *modp;
+	char *msgp;
+	char *msghp;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		modp = "";
+		msghp = "";
+		nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+		if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+			msgp = "CPU entered NMI handler function, but has not exited";
+		} else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+			msgp = "CPU is handling NMIs";
+		} else {
+			idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+			      (cpu_is_offline(cpu) << 1) |
+			      (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+			msgp = nmi_check_stall_msg[idx];
+			if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+				modp = ", but OK because ignore_nmis was set";
+			if (nmi_seq & ~0x1)
+				msghp = " (CPU currently in NMI handler function)";
+			else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+				msghp = " (CPU exited one NMI handler function)";
+		}
+		pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+			 __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+	}
+}
+
+#endif
+
 void stop_nmi(void)
 {
 	ignore_nmis++;
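As a worked example of the table lookup in nmi_backtrace_stall_check() above (the CPU
number and jiffies value here are made up for illustration): for an online CPU whose
snapshotted idt_seq is even and whose idt_calls count has not advanced since the
snapshot, the index works out to (0 << 2) | (0 << 1) | 0 == 0, selecting the first
message, so the report would read roughly:

	nmi_backtrace_stall_check: CPU 3: NMIs are not reaching exc_nmi() handler, last activity: 5000 jiffies ago.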