11/* SPDX-License-Identifier: GPL-2.0 */
22#ifndef _X86_POSTED_INTR_H
33#define _X86_POSTED_INTR_H
4+
5+ #include <asm/cmpxchg.h>
6+ #include <asm/rwonce.h>
47#include <asm/irq_vectors.h>
58
9+ #include <linux/bitmap.h>
10+
611#define POSTED_INTR_ON 0 /* bit number in pi_desc->control; passed to test_and_set_bit() by pi_test_and_set_on() */
712#define POSTED_INTR_SN 1 /* bit number in pi_desc->control for the SN flag handled by the pi_*_sn() helpers */
813
914#define PID_TABLE_ENTRY_VALID 1 /* NOTE(review): consumer is not visible in this chunk -- confirm meaning against its user */
1015
16+ #define NR_PIR_VECTORS 256 /* the PIR is a 256-bit field, indexed by vector number */
17+ #define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) /* number of unsigned long words in pi_desc.pir[] */
18+
1119/* Posted-Interrupt Descriptor */
1220struct pi_desc {
13- union {
14- u32 pir [8 ]; /* Posted interrupt requested */
15- u64 pir64 [4 ];
16- };
21+ unsigned long pir [NR_PIR_WORDS ]; /* Posted interrupt requested */
1722 union {
1823 struct {
1924 u16 notifications ; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
2631 u32 rsvd [6 ];
2732} __aligned (64 );
2833
34+ /*
35+ * De-multiplexing posted interrupts is on the performance path, the code
36+ * below is written to optimize the cache performance based on the following
37+ * considerations:
38+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
39+ * accessed by both CPU and IOMMU.
40+ * 2.During software processing of posted interrupts, the CPU needs to do
41+ * natural width read and xchg for checking and clearing posted interrupt
42+ * request (PIR), a 256 bit field within the PID.
43+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
44+ * line when posting interrupts and setting control bits.
45+ * 4.The CPU can access the cache line an order of magnitude faster than the IOMMU.
46+ * 5.Each time the IOMMU posts an interrupt to the PIR, it will evict the PID
47+ * cache line. The cache line states after each operation are as follows,
48+ * assuming a 64-bit kernel:
49+ *   CPU             IOMMU                   PID Cache line state
50+ *   ---------------------------------------------------------------
51+ *...read64                                  exclusive
52+ *...lock xchg64                             modified
53+ *...                post/atomic swap        invalid
54+ *...-------------------------------------------------------------
55+ *
56+ * To reduce L1 data cache misses, it is important to avoid contention with
57+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
58+ * when processing posted interrupts in software, e.g. to dispatch interrupt
59+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
60+ * in KVM.
61+ *
62+ * In addition, the code tries to keep the cache line state as consistent
63+ * as possible, e.g. when making a copy and clearing the PIR
64+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
65+ * read, read, read, read, xchg, xchg, xchg, xchg
66+ * instead of:
67+ * read, xchg, read, xchg, read, xchg, read, xchg
68+ */
69+ static __always_inline bool pi_harvest_pir (unsigned long * pir ,
70+ unsigned long * pir_vals )
71+ {
72+ unsigned long pending = 0 ;
73+ int i ;
74+
75+ for (i = 0 ; i < NR_PIR_WORDS ; i ++ ) {
76+ pir_vals [i ] = READ_ONCE (pir [i ]);
77+ pending |= pir_vals [i ];
78+ }
79+
80+ if (!pending )
81+ return false;
82+
83+ for (i = 0 ; i < NR_PIR_WORDS ; i ++ ) {
84+ if (!pir_vals [i ])
85+ continue ;
86+
87+ pir_vals [i ] = arch_xchg (& pir [i ], 0 );
88+ }
89+
90+ return true;
91+ }
92+
2993static inline bool pi_test_and_set_on (struct pi_desc * pi_desc )
3094{
3195 return test_and_set_bit (POSTED_INTR_ON , (unsigned long * )& pi_desc -> control );
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
43107
44108static inline bool pi_test_and_set_pir (int vector , struct pi_desc * pi_desc )
45109{
46- return test_and_set_bit (vector , ( unsigned long * ) pi_desc -> pir );
110+ return test_and_set_bit (vector , pi_desc -> pir );
47111}
48112
49113static inline bool pi_is_pir_empty (struct pi_desc * pi_desc )
50114{
51- return bitmap_empty (( unsigned long * ) pi_desc -> pir , NR_VECTORS );
115+ return bitmap_empty (pi_desc -> pir , NR_VECTORS );
52116}
53117
54118static inline void pi_set_sn (struct pi_desc * pi_desc )
@@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
110174 if (WARN_ON_ONCE (vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR ))
111175 return false;
112176
113- return test_bit (vector , ( unsigned long * ) pid -> pir );
177+ return test_bit (vector , pid -> pir );
114178}
115179
116180extern void intel_posted_msi_init (void );
0 commit comments