Skip to content

Commit 691806e

Browse files
committed
Merge tag 'thermal-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull thermal control updates from Rafael Wysocki: "These include thermal core fixes to protect thermal device operations against thermal device removal, other thermal core fixes and updates of Intel thermal control drivers. Specifics: - Fix race conditions related to thermal device operations that are not protected against thermal device removal (Guenter Roeck) - Fix error code in __thermal_cooling_device_register() (Dan Carpenter) - Validate new cooling device state (coming from user space) in cur_state_store() and reuse the max_state value from the cooling device structure in the sysfs interface (Viresh Kumar) - Fix some possible name leaks in error paths in the thermal control core code (Yang Yingliang) - Detect TCC lock bit set in the intel_tcc_cooling driver and make it refuse to update the TCC offset in that case (Zhang Rui) - Add TCC cooling support for RaptorLake-S (Zhang Rui) - Prevent accidental clearing of HFI status by one of the other drivers using the same status register (Srinivas Pandruvada) - Protect clearing of thermal status bits in Intel thermal control drivers (Srinivas Pandruvada) - Allow the HFI thermal control driver to ACK an HFI event for the previously observed timestamp (Srinivas Pandruvada) - Remove a pointless die_id check from the HFI thermal driver and adjust the definition of a data structure used by it (Ricardo Neri)" * tag 'thermal-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: thermal: intel: hfi: Remove a pointless die_id check thermal: core: fix some possible name leaks in error paths thermal: intel: hfi: ACK HFI for the same timestamp thermal: intel: Protect clearing of thermal status bits thermal: intel: Prevent accidental clearing of HFI status thermal/core: Protect thermal device operations against thermal device removal thermal/core: Remove thermal_zone_set_trips() thermal/core: Protect sysfs accesses to thermal operations with thermal zone mutex thermal/core: Protect hwmon accesses to thermal 
operations with thermal zone mutex thermal/core: Introduce locked version of thermal_zone_device_update thermal/core: Move parameter validation from __thermal_zone_get_temp to thermal_zone_get_temp thermal/core: Ensure that thermal device is registered in thermal_zone_get_temp thermal/core: Delete device under thermal device zone lock thermal/core: Destroy thermal zone device mutex in release function thermal: intel: intel_tcc_cooling: Add TCC cooling support for RaptorLake-S thermal: intel: intel_tcc_cooling: Detect TCC lock bit thermal: intel: hfi: Improve the type of hfi_features::nr_table_pages thermal/core: fix error code in __thermal_cooling_device_register() thermal: sysfs: Reuse cdev->max_state thermal: Validate new state in cur_state_store()
2 parents 456ed86 + 75b15aa commit 691806e

12 files changed

Lines changed: 256 additions & 142 deletions

drivers/thermal/gov_fair_share.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,7 @@ static int get_trip_level(struct thermal_zone_device *tz)
4949
static long get_target_state(struct thermal_zone_device *tz,
5050
struct thermal_cooling_device *cdev, int percentage, int level)
5151
{
52-
unsigned long max_state;
53-
54-
cdev->ops->get_max_state(cdev, &max_state);
55-
56-
return (long)(percentage * level * max_state) / (100 * tz->num_trips);
52+
return (long)(percentage * level * cdev->max_state) / (100 * tz->num_trips);
5753
}
5854

5955
/**

drivers/thermal/intel/intel_hfi.c

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,7 @@
4242

4343
#include "../thermal_core.h"
4444
#include "intel_hfi.h"
45-
46-
#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \
47-
BIT(9) | BIT(11) | BIT(26))
45+
#include "thermal_interrupt.h"
4846

4947
/* Hardware Feedback Interface MSR configuration bits */
5048
#define HW_FEEDBACK_PTR_VALID_BIT BIT(0)
@@ -137,7 +135,7 @@ struct hfi_instance {
137135
* Parameters and supported features that are common to all HFI instances
138136
*/
139137
struct hfi_features {
140-
unsigned int nr_table_pages;
138+
size_t nr_table_pages;
141139
unsigned int cpu_stride;
142140
unsigned int hdr_size;
143141
};
@@ -252,7 +250,7 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
252250
struct hfi_instance *hfi_instance;
253251
int cpu = smp_processor_id();
254252
struct hfi_cpu_info *info;
255-
u64 new_timestamp;
253+
u64 new_timestamp, msr, hfi;
256254

257255
if (!pkg_therm_status_msr_val)
258256
return;
@@ -281,9 +279,21 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
281279
if (!raw_spin_trylock(&hfi_instance->event_lock))
282280
return;
283281

284-
/* Skip duplicated updates. */
282+
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr);
283+
hfi = msr & PACKAGE_THERM_STATUS_HFI_UPDATED;
284+
if (!hfi) {
285+
raw_spin_unlock(&hfi_instance->event_lock);
286+
return;
287+
}
288+
289+
/*
290+
* Ack duplicate update. Since there is an active HFI
291+
* status from HW, it must be a new event, not a case
292+
* where a lagging CPU entered the locked region.
293+
*/
285294
new_timestamp = *(u64 *)hfi_instance->hw_table;
286295
if (*hfi_instance->timestamp == new_timestamp) {
296+
thermal_clear_package_intr_status(PACKAGE_LEVEL, PACKAGE_THERM_STATUS_HFI_UPDATED);
287297
raw_spin_unlock(&hfi_instance->event_lock);
288298
return;
289299
}
@@ -297,16 +307,14 @@ void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
297307
memcpy(hfi_instance->local_table, hfi_instance->hw_table,
298308
hfi_features.nr_table_pages << PAGE_SHIFT);
299309

300-
raw_spin_unlock(&hfi_instance->table_lock);
301-
raw_spin_unlock(&hfi_instance->event_lock);
302-
303310
/*
304311
* Let hardware know that we are done reading the HFI table and it is
305312
* free to update it again.
306313
*/
307-
pkg_therm_status_msr_val &= THERM_STATUS_CLEAR_PKG_MASK &
308-
~PACKAGE_THERM_STATUS_HFI_UPDATED;
309-
wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, pkg_therm_status_msr_val);
314+
thermal_clear_package_intr_status(PACKAGE_LEVEL, PACKAGE_THERM_STATUS_HFI_UPDATED);
315+
316+
raw_spin_unlock(&hfi_instance->table_lock);
317+
raw_spin_unlock(&hfi_instance->event_lock);
310318

311319
queue_delayed_work(hfi_updates_wq, &hfi_instance->update_work,
312320
HFI_UPDATE_INTERVAL);
@@ -371,7 +379,7 @@ void intel_hfi_online(unsigned int cpu)
371379
die_id = topology_logical_die_id(cpu);
372380
hfi_instance = info->hfi_instance;
373381
if (!hfi_instance) {
374-
if (die_id < 0 || die_id >= max_hfi_instances)
382+
if (die_id >= max_hfi_instances)
375383
return;
376384

377385
hfi_instance = &hfi_instances[die_id];

drivers/thermal/intel/intel_tcc_cooling.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define TCC_SHIFT 24
1515
#define TCC_MASK (0x3fULL<<24)
1616
#define TCC_PROGRAMMABLE BIT(30)
17+
#define TCC_LOCKED BIT(31)
1718

1819
static struct thermal_cooling_device *tcc_cdev;
1920

@@ -84,6 +85,7 @@ static const struct x86_cpu_id tcc_ids[] __initconst = {
8485
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, NULL),
8586
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL),
8687
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL),
88+
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL),
8789
{}
8890
};
8991

@@ -108,6 +110,15 @@ static int __init tcc_cooling_init(void)
108110
if (!(val & TCC_PROGRAMMABLE))
109111
return -ENODEV;
110112

113+
err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);
114+
if (err)
115+
return err;
116+
117+
if (val & TCC_LOCKED) {
118+
pr_info("TCC Offset locked\n");
119+
return -ENODEV;
120+
}
121+
111122
pr_info("Programmable TCC Offset detected\n");
112123

113124
tcc_cdev =

drivers/thermal/intel/therm_throt.c

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -190,32 +190,33 @@ static const struct attribute_group thermal_attr_group = {
190190
};
191191
#endif /* CONFIG_SYSFS */
192192

193-
#define CORE_LEVEL 0
194-
#define PACKAGE_LEVEL 1
195-
196193
#define THERM_THROT_POLL_INTERVAL HZ
197194
#define THERM_STATUS_PROCHOT_LOG BIT(1)
198195

199196
#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
200-
#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
197+
#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(26))
201198

202-
static void clear_therm_status_log(int level)
199+
/*
200+
* Clear the bits in package thermal status register for bit = 1
201+
* in bitmask
202+
*/
203+
void thermal_clear_package_intr_status(int level, u64 bit_mask)
203204
{
205+
u64 msr_val;
204206
int msr;
205-
u64 mask, msr_val;
206207

207208
if (level == CORE_LEVEL) {
208209
msr = MSR_IA32_THERM_STATUS;
209-
mask = THERM_STATUS_CLEAR_CORE_MASK;
210+
msr_val = THERM_STATUS_CLEAR_CORE_MASK;
210211
} else {
211212
msr = MSR_IA32_PACKAGE_THERM_STATUS;
212-
mask = THERM_STATUS_CLEAR_PKG_MASK;
213+
msr_val = THERM_STATUS_CLEAR_PKG_MASK;
213214
}
214215

215-
rdmsrl(msr, msr_val);
216-
msr_val &= mask;
217-
wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
216+
msr_val &= ~bit_mask;
217+
wrmsrl(msr, msr_val);
218218
}
219+
EXPORT_SYMBOL_GPL(thermal_clear_package_intr_status);
219220

220221
static void get_therm_status(int level, bool *proc_hot, u8 *temp)
221222
{
@@ -295,7 +296,7 @@ static void __maybe_unused throttle_active_work(struct work_struct *work)
295296
state->average = avg;
296297

297298
re_arm:
298-
clear_therm_status_log(state->level);
299+
thermal_clear_package_intr_status(state->level, THERM_STATUS_PROCHOT_LOG);
299300
schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
300301
}
301302

drivers/thermal/intel/thermal_interrupt.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
#ifndef _INTEL_THERMAL_INTERRUPT_H
33
#define _INTEL_THERMAL_INTERRUPT_H
44

5+
#define CORE_LEVEL 0
6+
#define PACKAGE_LEVEL 1
7+
58
/* Interrupt Handler for package thermal thresholds */
69
extern int (*platform_thermal_package_notify)(__u64 msr_val);
710

@@ -15,4 +18,7 @@ extern bool (*platform_thermal_package_rate_control)(void);
1518
/* Handle HWP interrupt */
1619
extern void notify_hwp_interrupt(void);
1720

21+
/* Common function to clear Package thermal status register */
22+
extern void thermal_clear_package_intr_status(int level, u64 bit_mask);
23+
1824
#endif /* _INTEL_THERMAL_INTERRUPT_H */

drivers/thermal/intel/x86_pkg_temp_thermal.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,6 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
265265
struct thermal_zone_device *tzone = NULL;
266266
int cpu = smp_processor_id();
267267
struct zone_device *zonedev;
268-
u64 msr_val, wr_val;
269268

270269
mutex_lock(&thermal_zone_mutex);
271270
raw_spin_lock_irq(&pkg_temp_lock);
@@ -279,12 +278,8 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
279278
}
280279
zonedev->work_scheduled = false;
281280

282-
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
283-
wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
284-
if (wr_val != msr_val) {
285-
wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
286-
tzone = zonedev->tzone;
287-
}
281+
thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
282+
tzone = zonedev->tzone;
288283

289284
enable_pkg_thres_interrupt();
290285
raw_spin_unlock_irq(&pkg_temp_lock);

0 commit comments

Comments
 (0)