kernel-brax3-ubuntu-touch/drivers/misc/mediatek/sched/fair.c
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2021 MediaTek Inc.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/hooks/sched.h>
#include <sched/sched.h>
#include "eas/eas_plus.h"
#include "sugov/cpufreq.h"
#if IS_ENABLED(CONFIG_MTK_GEARLESS_SUPPORT)
#include "mtk_energy_model/v2/energy_model.h"
#else
#include "mtk_energy_model/v1/energy_model.h"
#endif
#include "common.h"
#include <sched/pelt.h>
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#if IS_ENABLED(CONFIG_MTK_THERMAL_INTERFACE)
#include <thermal_interface.h>
#endif
#define CREATE_TRACE_POINTS
#include "sched_trace.h"
MODULE_LICENSE("GPL");
/*
* Unsigned subtract and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define sub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
res = var - val; \
if (res > var) \
res = 0; \
WRITE_ONCE(*ptr, res); \
} while (0)
/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
* and is thus optimized for local variable updates.
*/
#define lsub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
#ifdef CONFIG_SMP
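/*
* Local copies of the core scheduler's PELT / util_est accessors:
* task_util() reads the task's PELT utilization, _task_util_est() its
* estimated utilization (max of the EWMA and the last enqueued sample),
* and task_util_est() combines the two when UTIL_EST is active.
*/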
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
{
if (sched_feat(UTIL_EST) && is_util_est_enable())
return max(task_util(p), _task_util_est(p));
return task_util(p);
}
#ifdef CONFIG_UCLAMP_TASK
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return clamp(task_util_est(p),
uclamp_eff_value(p, UCLAMP_MIN),
uclamp_eff_value(p, UCLAMP_MAX));
}
#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return task_util_est(p);
}
#endif
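/* Does @p's clamped utilization fit in @capacity, with the usual fits_capacity() headroom? */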
int task_fits_capacity(struct task_struct *p, long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
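/*
* cpu_util() - CFS utilization of @cpu, boosted by util_est when enabled
* and clamped to the CPU's original capacity.
*/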
unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST) && is_util_est_enable())
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
#if IS_ENABLED(CONFIG_MTK_EAS)
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST) && is_util_est_enable()) {
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util = max(util, util_est);
}
return min(util, capacity_orig_of(cpu));
}
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu. Unlike cpu_util_next(), the caller passes in snapshots of the
* rq's signals:
*   util_freq = READ_ONCE(cfs_rq->avg.util_avg);
*   util_est  = READ_ONCE(cfs_rq->avg.util_est.enqueued);  (when UTIL_EST)
*/
static unsigned long mtk_cpu_util_next(int cpu, struct task_struct *p, int dst_cpu,
unsigned long util_freq, unsigned long util_est)
{
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util_freq, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util_freq += task_util(p);
if (sched_feat(UTIL_EST) && is_util_est_enable()) {
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util_freq = max(util_freq, util_est);
}
return min(util_freq, capacity_orig_of(cpu));
}
/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
* Returns the energy delta of placing task @p on @dst_cpu.
*/
static unsigned long
mtk_compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd,
unsigned long min_cap, unsigned long max_cap)
{
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
unsigned long max_util_base = 0, max_util_cur = 0;
unsigned long cpu_energy_util, sum_util_base = 0, sum_util_cur = 0;
unsigned long _cpu_cap = cpu_cap;
unsigned long energy_base = 0, energy_cur = 0, energy_delta = 0;
int cpu;
int cpu_temp[NR_CPUS];
_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
* utilization of these CPUs too by masking pd with cpu_online_mask
* instead of the rd span.
*
* If an entire pd is outside of the current rd, it will not appear in
* its pd list and will not be accounted by compute_energy().
*/
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
unsigned long cpu_util_base, cpu_util_cur;
unsigned long util_freq_base, util_freq_cur, util_running_base, util_running_cur;
struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est = 0, util_freq = READ_ONCE(cfs_rq->avg.util_avg);
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
struct util_rq util_rq_energy, util_rq_freq;
#endif
if (sched_feat(UTIL_EST) && is_util_est_enable())
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
util_freq_base = mtk_cpu_util_next(cpu, p, -1, util_freq, util_est);
util_running_base = util_freq_base;
/*
* Busy time computation: utilization clamping is not
* required since the ratio (sum_util / cpu_capacity)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
util_rq_freq.util_cfs = util_freq_base;
util_rq_freq.base = 1;
util_rq_energy.util_cfs = util_running_base;
util_rq_energy.base = 1;
cpu_energy_util = mtk_cpu_util(cpu, &util_rq_energy, cpu_cap,
ENERGY_UTIL, NULL, min_cap, max_cap);
#else
cpu_energy_util = effective_cpu_util(cpu, util_running_base, cpu_cap,
ENERGY_UTIL, NULL);
#endif
sum_util_base += min(cpu_energy_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
* must be considered since it affects the selection
* of the performance domain frequency.
* NOTE: when RT tasks are running, the FREQUENCY_UTIL utilization
* may by default be raised to the maximum OPP.
*/
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
cpu_util_base = mtk_cpu_util(cpu, &util_rq_freq, cpu_cap,
FREQUENCY_UTIL, NULL, min_cap, max_cap);
#else
cpu_util_base = effective_cpu_util(cpu, util_freq_base, cpu_cap,
FREQUENCY_UTIL, NULL);
#endif
/*
* When @p is placed on @cpu:
*
* util_running = max(cpu_util, cpu_util_est) +
* max(task_util, _task_util_est)
*
* while cpu_util_next is: max(cpu_util + task_util,
* cpu_util_est + _task_util_est)
*/
if (cpu == dst_cpu) {
util_freq_cur = mtk_cpu_util_next(cpu, p, dst_cpu, util_freq, util_est);
util_running_cur =
mtk_cpu_util_next(cpu, p, -1, util_freq, util_est)
+ task_util_est(p);
/*
* Busy time computation: utilization clamping is not
* required since the ratio (sum_util / cpu_capacity)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
util_rq_freq.util_cfs = util_freq_cur;
util_rq_energy.util_cfs = util_running_cur;
cpu_energy_util = mtk_cpu_util(cpu, &util_rq_energy, cpu_cap,
ENERGY_UTIL, NULL, min_cap, max_cap);
#else
cpu_energy_util = effective_cpu_util(cpu, util_running_cur, cpu_cap,
ENERGY_UTIL, NULL);
#endif
sum_util_cur += min(cpu_energy_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
* must be considered since it affects the selection
* of the performance domain frequency.
* NOTE: when RT tasks are running, the FREQUENCY_UTIL utilization
* may by default be raised to the maximum OPP.
*/
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
cpu_util_cur = mtk_cpu_util(cpu, &util_rq_freq, cpu_cap,
FREQUENCY_UTIL, tsk, min_cap, max_cap);
#else
cpu_util_cur = effective_cpu_util(cpu, util_freq_cur, cpu_cap,
FREQUENCY_UTIL, tsk);
#endif
} else {
util_running_cur = util_running_base;
util_freq_cur = util_freq_base;
sum_util_cur += cpu_energy_util;
cpu_util_cur = cpu_util_base;
}
max_util_base = max(max_util_base, min(cpu_util_base, _cpu_cap));
max_util_cur = max(max_util_cur, min(cpu_util_cur, _cpu_cap));
if (trace_sched_energy_util_enabled()) {
trace_sched_energy_util(-1, max_util_base, sum_util_base, cpu,
util_freq_base, util_running_base, cpu_util_base);
trace_sched_energy_util(dst_cpu, max_util_cur, sum_util_cur, cpu,
util_freq_cur, util_running_cur, cpu_util_cur);
}
/* get temperature for each cpu (millidegrees -> degrees C) */
cpu_temp[cpu] = get_cpu_temp(cpu);
cpu_temp[cpu] /= 1000;
}
energy_base = mtk_em_cpu_energy(pd->em_pd, max_util_base, sum_util_base,
_cpu_cap, cpu_temp);
energy_cur = mtk_em_cpu_energy(pd->em_pd, max_util_cur, sum_util_cur,
_cpu_cap, cpu_temp);
energy_delta = energy_cur - energy_base;
if (trace_sched_compute_energy_enabled()) {
trace_sched_compute_energy(-1, pd_mask, energy_base, max_util_base, sum_util_base);
trace_sched_compute_energy(dst_cpu, pd_mask, energy_cur, max_util_cur,
sum_util_cur);
}
return energy_delta;
}
#endif
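/*
* When uclamp_min_ls is set, tasks with a non-zero UCLAMP_MIN request are
* also treated as latency sensitive (see the latency_sensitive checks below).
*/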
static unsigned int uclamp_min_ls;
void set_uclamp_min_ls(unsigned int val)
{
uclamp_min_ls = val;
}
EXPORT_SYMBOL_GPL(set_uclamp_min_ls);
unsigned int get_uclamp_min_ls(void)
{
return uclamp_min_ls;
}
EXPORT_SYMBOL_GPL(get_uclamp_min_ls);
/*
* attach_task() -- attach the task detached by detach_task() to its new rq.
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
check_preempt_curr(rq, p, 0);
}
/*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
struct rq_flags rf;
rq_lock(rq, &rf);
update_rq_clock(rq);
attach_task(rq, p);
rq_unlock(rq, &rf);
}
#if IS_ENABLED(CONFIG_MTK_EAS)
struct cpumask system_cpumask;
void init_system_cpumask(void)
{
cpumask_copy(&system_cpumask, cpu_possible_mask);
}
void set_system_cpumask(const struct cpumask *srcp)
{
cpumask_copy(&system_cpumask, srcp);
}
EXPORT_SYMBOL_GPL(set_system_cpumask);
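/*
* Bitmask variant of set_system_cpumask(): bit N of @cpumask_val selects
* CPU N, e.g. set_system_cpumask_int(0x0f) would limit latency-sensitive
* placement to CPUs 0-3 (example value only).
*/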
void set_system_cpumask_int(unsigned int cpumask_val)
{
struct cpumask cpumask_setting;
unsigned long cpumask_ulval = cpumask_val;
int cpu;
cpumask_clear(&cpumask_setting);
for_each_possible_cpu(cpu) {
if (test_bit(cpu, &cpumask_ulval))
cpumask_set_cpu(cpu, &cpumask_setting);
}
cpumask_copy(&system_cpumask, &cpumask_setting);
}
EXPORT_SYMBOL_GPL(set_system_cpumask_int);
struct cpumask *get_system_cpumask(void)
{
return &system_cpumask;
}
EXPORT_SYMBOL_GPL(get_system_cpumask);
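/*
* bcpus caches the cpumask of the most powerful gear ("big" CPUs) and
* util_Th the capacity threshold below which a task is considered small
* enough to stay off them; both are refreshed by
* get_most_powerful_pd_and_util_Th().
*/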
static struct cpumask bcpus;
static unsigned long util_Th;
void get_most_powerful_pd_and_util_Th(void)
{
unsigned int nr_gear = get_nr_gears();
/* no multiple pd */
if (WARN_ON(nr_gear <= 1)) {
util_Th = 0;
return;
}
/* pd_capacity_tbl is sorted in ascending order,
* so gear nr_gear-1 is the most powerful gear and
* gear nr_gear-2 is the second most powerful gear.
*/
cpumask_copy(&bcpus, get_gear_cpumask(nr_gear-1));
/* threshold: the largest capacity of the second most powerful gear (mid cpus) */
util_Th = pd_get_opp_capacity(
cpumask_first(get_gear_cpumask(nr_gear-2)), 0);
}
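/*
* Return true when @cpu may be skipped for @p: the task is not latency
* sensitive, has no uclamp_min boost, @cpu belongs to the big gear and the
* task's utilization fits below util_Th.
*/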
static inline bool task_can_skip_this_cpu(struct task_struct *p, unsigned long p_uclamp_min,
bool latency_sensitive, int cpu, struct cpumask *bcpus)
{
bool cpu_in_bcpus;
unsigned long task_util;
if (latency_sensitive)
return 0;
if (p_uclamp_min > 0)
return 0;
if (cpumask_empty(bcpus))
return 0;
cpu_in_bcpus = cpumask_test_cpu(cpu, bcpus);
task_util = task_util_est(p);
if (!cpu_in_bcpus || !fits_capacity(task_util, util_Th))
return 0;
return 1;
}
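/*
* Lightweight wake-up CPU selection used from interrupt context: no energy
* model evaluation, just a per-gear power-efficiency comparison
* (calc_pwr_eff()) between max-spare-capacity candidates, or between the
* shallowest-idle CPUs for latency_sensitive tasks, with several layers of
* backup CPUs when nothing qualifies.
*/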
int mtk_find_energy_efficient_cpu_in_interrupt(struct task_struct *p, bool latency_sensitive,
struct perf_domain *pd, unsigned long min_cap, unsigned long max_cap)
{
int target_cpu = -1, cpu;
unsigned long cpu_util;
unsigned long pwr, best_pwr = ULONG_MAX, best_idle_pwr = ULONG_MAX;
unsigned long cpu_cap = 0;
unsigned int fit_cpus = 0;
unsigned int idle_cpus = 0;
long max_spare_cap = LONG_MIN, spare_cap, max_spare_cap_per_gear;
int max_spare_cap_cpu = -1, max_spare_cap_cpu_per_gear;
long sys_max_spare_cap = LONG_MIN, idle_max_spare_cap = LONG_MIN;
int sys_max_spare_cap_cpu = -1, idle_max_spare_cap_cpu = -1;
unsigned long util;
bool not_in_softmask;
unsigned int min_exit_lat = UINT_MAX, min_exit_lat_per_gear;
struct cpuidle_state *idle;
int best_idle_cpu = -1, best_idle_cpu_per_gear;
long best_idle_max_spare_cap = LONG_MIN, best_idle_cpu_cap_per_gear;
int this_cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int select_reason = -1;
struct cpumask allowed_cpu_mask;
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
u64 ts[9] = {0};
ts[0] = sched_clock();
#endif
/* allowed_cpu_mask is filled below, start from an empty mask */
cpumask_clear(&allowed_cpu_mask);
for (; pd; pd = pd->next) {
max_spare_cap_cpu_per_gear = -1;
max_spare_cap_per_gear = LONG_MIN;
min_exit_lat_per_gear = UINT_MAX;
best_idle_cpu_per_gear = -1;
best_idle_cpu_cap_per_gear = LONG_MIN;
for_each_cpu_and(cpu, perf_domain_span(pd), cpu_active_mask) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (cpu_paused(cpu))
continue;
cpumask_set_cpu(cpu, &allowed_cpu_mask);
if (task_can_skip_this_cpu(p, min_cap, latency_sensitive, cpu, &bcpus))
continue;
if (cpu_rq(cpu)->rt.rt_nr_running >= 1 &&
!rt_rq_throttled(&(cpu_rq(cpu)->rt)))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap;
lsub_positive(&spare_cap, util);
not_in_softmask = (latency_sensitive &&
!cpumask_test_cpu(cpu, &system_cpumask));
if (not_in_softmask)
continue;
/* record sys_max_spare_cap_cpu */
if (spare_cap > sys_max_spare_cap) {
sys_max_spare_cap = spare_cap;
sys_max_spare_cap_cpu = cpu;
}
/*
* If no best idle cpu is found, fall back to the idle cpu with the max
* spare capacity for a latency_sensitive task, so it does not sit
* runnable behind other tasks. Since this is only a backup option, exit
* latency is not taken into account.
*/
if (latency_sensitive && idle_cpu(cpu) &&
spare_cap > idle_max_spare_cap) {
idle_max_spare_cap = spare_cap;
idle_max_spare_cap_cpu = cpu;
}
/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
* aligned with effective_cpu_util().
*/
cpu_util = mtk_uclamp_rq_util_with(cpu_rq(cpu), util, p, min_cap, max_cap);
if (!fits_capacity(cpu_util, cpu_cap))
continue;
fit_cpus = (fit_cpus | (1 << cpu));
/*
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
if (spare_cap > max_spare_cap_per_gear) {
max_spare_cap_per_gear = spare_cap;
max_spare_cap_cpu_per_gear = cpu;
}
if (!latency_sensitive)
continue;
if (idle_cpu(cpu)) {
idle_cpus = (idle_cpus | (1 << cpu));
idle = idle_get_state(cpu_rq(cpu));
if (idle) {
/* non WFI, find shortest exit_latency */
if (idle->exit_latency < min_exit_lat_per_gear) {
min_exit_lat_per_gear = idle->exit_latency;
best_idle_cpu_per_gear = cpu;
best_idle_cpu_cap_per_gear = spare_cap;
} else if ((idle->exit_latency == min_exit_lat_per_gear)
&& (best_idle_cpu_cap_per_gear < spare_cap)) {
best_idle_cpu_per_gear = cpu;
best_idle_cpu_cap_per_gear = spare_cap;
}
} else {
/* WFI, find max_spare_cap */
if (min_exit_lat_per_gear > 0) {
min_exit_lat_per_gear = 0;
best_idle_cpu_per_gear = cpu;
best_idle_cpu_cap_per_gear = spare_cap;
} else if (best_idle_cpu_cap_per_gear < spare_cap) {
best_idle_cpu_per_gear = cpu;
best_idle_cpu_cap_per_gear = spare_cap;
}
}
}
}
/* not a latency_sensitive task: select the max spare capacity cpu */
if (!latency_sensitive && max_spare_cap_cpu_per_gear >= 0) {
/* calculate power consumption of candidate cpu per gear */
pwr = calc_pwr_eff(max_spare_cap_cpu_per_gear, cpu_util);
/* if cpu power is better, select it as candidate */
if (best_pwr > pwr) {
best_pwr = pwr;
max_spare_cap_cpu = max_spare_cap_cpu_per_gear;
max_spare_cap = max_spare_cap_per_gear;
}
/* if the power of the two cpus is identical, prefer the larger spare capacity */
else if ((best_pwr == pwr) && (max_spare_cap < max_spare_cap_per_gear)) {
max_spare_cap_cpu = max_spare_cap_cpu_per_gear;
max_spare_cap = max_spare_cap_per_gear;
}
}
/* latency_sensitive task: select best_idle_cpu (shallowest idle state) */
if (latency_sensitive && best_idle_cpu_per_gear >= 0) {
pwr = calc_pwr_eff(best_idle_cpu_per_gear, cpu_util);
if (best_idle_pwr > pwr) {
best_idle_pwr = pwr;
best_idle_cpu = best_idle_cpu_per_gear;
best_idle_max_spare_cap = best_idle_cpu_cap_per_gear;
min_exit_lat = min_exit_lat_per_gear;
}
/* if the power of the two cpus is identical, prefer the larger spare capacity */
else if ((best_idle_pwr == pwr)
&& (best_idle_max_spare_cap < best_idle_cpu_cap_per_gear)) {
best_idle_cpu = best_idle_cpu_per_gear;
best_idle_max_spare_cap = best_idle_cpu_cap_per_gear;
min_exit_lat = min_exit_lat_per_gear;
}
}
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[1] = sched_clock();
#endif
if (latency_sensitive) {
if (best_idle_cpu >= 0) {
/* best idle cpu existed */
target_cpu = best_idle_cpu;
select_reason = LB_LATENCY_SENSITIVE_BEST_IDLE_CPU;
} else if (idle_max_spare_cap_cpu >= 0) {
target_cpu = idle_max_spare_cap_cpu;
select_reason = LB_LATENCY_SENSITIVE_IDLE_MAX_SPARE_CPU;
} else {
target_cpu = sys_max_spare_cap_cpu;
select_reason = LB_LATENCY_SENSITIVE_MAX_SPARE_CPU;
}
goto out;
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[2] = sched_clock();
#endif
if (max_spare_cap_cpu != -1) {
target_cpu = max_spare_cap_cpu;
select_reason = LB_BEST_ENERGY_CPU;
goto out;
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[3] = sched_clock();
#endif
/* All cpus failed fits_capacity(); fall back to sys_max_spare_cap_cpu */
if (sys_max_spare_cap_cpu != -1) {
target_cpu = sys_max_spare_cap_cpu;
select_reason = LB_MAX_SPARE_CPU;
goto out;
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[4] = sched_clock();
#endif
/* no best_idle_cpu and no max_spare_cap_cpu available:
* fall back to this_cpu or prev_cpu if present in allowed_cpu_mask
*/
if (target_cpu == -1) {
if (cpumask_test_cpu(this_cpu, &allowed_cpu_mask)) {
target_cpu = this_cpu;
select_reason = LB_IRQ_BACKUP_CURR;
goto out;
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[5] = sched_clock();
#endif
if (cpumask_test_cpu(prev_cpu, &allowed_cpu_mask)) {
target_cpu = prev_cpu;
select_reason = LB_IRQ_BACKUP_PREV;
goto out;
}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[6] = sched_clock();
#endif
/* select a cpu in allowed_cpu_mask: not paused and no rt running */
if (cpumask_empty(&allowed_cpu_mask))
target_cpu = this_cpu;
else
target_cpu = cpumask_any(&allowed_cpu_mask);
select_reason = LB_IRQ_BACKUP_ALLOWED;
}
out:
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[7] = sched_clock();
#endif
if (trace_sched_find_cpu_in_irq_enabled())
trace_sched_find_cpu_in_irq(p, select_reason, target_cpu,
prev_cpu, fit_cpus, idle_cpus,
best_idle_cpu, best_idle_pwr, min_exit_lat,
max_spare_cap_cpu, best_pwr, max_spare_cap);
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
ts[8] = sched_clock();
if ((ts[8] - ts[0] > 1000000ULL) && in_hardirq()) {
int i, i_prev;
u64 prev, curr;
printk_deferred("%s duration %llu, ts[0]=%llu\n", __func__, ts[8] - ts[0], ts[0]);
i_prev = 0;
for (i = 0; i < 8; i++) {
if (ts[i+1]) {
prev = ts[i_prev];
curr = ts[i+1];
printk_deferred("%s ts[%d]=%llu, ts[%d]=%llu, duration=%llu\n",
__func__, i_prev, prev, i+1, curr, curr - prev);
i_prev = i+1;
}
}
}
#endif
return target_cpu;
}
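/*
* MTK replacement for find_energy_efficient_cpu(): walk every perf domain,
* track the max-spare-capacity and best-idle candidates, and pick the
* candidate with the lowest energy delta as computed by mtk_compute_energy().
* Invoked as an Android vendor hook (hence the void *data argument).
*/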
void mtk_find_energy_efficient_cpu(void *data, struct task_struct *p, int prev_cpu, int sync,
int *new_cpu)
{
unsigned long best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
int best_idle_cpu = -1;
long sys_max_spare_cap = LONG_MIN, idle_max_spare_cap = LONG_MIN;
int sys_max_spare_cap_cpu = -1;
int idle_max_spare_cap_cpu = -1;
unsigned long target_cap = 0;
unsigned long cpu_cap, util;
bool latency_sensitive = false;
unsigned int min_exit_lat = UINT_MAX;
int cpu, best_energy_cpu = -1;
struct cpuidle_state *idle;
struct perf_domain *pd;
int select_reason = -1;
unsigned long min_cap = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long max_cap = uclamp_eff_value(p, UCLAMP_MAX);
rcu_read_lock();
if (!uclamp_min_ls)
latency_sensitive = uclamp_latency_sensitive(p);
else {
latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
uclamp_latency_sensitive(p);
}
if (!latency_sensitive)
latency_sensitive = get_task_idle_prefer_by_task(p);
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized)) {
select_reason = LB_FAIL;
goto unlock;
}
cpu = smp_processor_id();
if (sync && cpu_rq(cpu)->nr_running == 1 &&
cpumask_test_cpu(cpu, p->cpus_ptr) &&
task_fits_capacity(p, capacity_of(cpu)) &&
!(latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))) {
rcu_read_unlock();
*new_cpu = cpu;
select_reason = LB_SYNC;
goto done;
}
if (unlikely(in_interrupt())) {
*new_cpu = mtk_find_energy_efficient_cpu_in_interrupt(p, latency_sensitive, pd,
min_cap, max_cap);
rcu_read_unlock();
select_reason = LB_IN_INTERRUPT;
goto done;
}
if (!task_util_est(p)) {
select_reason = LB_ZERO_UTIL;
goto unlock;
}
for (; pd; pd = pd->next) {
unsigned long cur_delta;
long spare_cap, max_spare_cap = LONG_MIN;
unsigned long max_spare_cap_ls_idle = 0;
int max_spare_cap_cpu = -1;
int max_spare_cap_cpu_ls_idle = -1;
#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
int cpu_order[NR_CPUS] ____cacheline_aligned, cnt, i;
#endif
#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
cnt = sort_thermal_headroom(perf_domain_span(pd), cpu_order);
for (i = 0; i < cnt; i++) {
cpu = cpu_order[i];
#else
for_each_cpu_and(cpu, perf_domain_span(pd), cpu_active_mask) {
#endif
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (cpu_paused(cpu))
continue;
if (cpu_rq(cpu)->rt.rt_nr_running >= 1 &&
!rt_rq_throttled(&(cpu_rq(cpu)->rt)))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap;
lsub_positive(&spare_cap, util);
if ((spare_cap > sys_max_spare_cap) &&
!(latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))) {
sys_max_spare_cap = spare_cap;
sys_max_spare_cap_cpu = cpu;
}
if (latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))
continue;
/*
* If no best idle cpu is found, fall back to the idle cpu with the max
* spare capacity for a latency_sensitive task, so it does not sit
* runnable behind other tasks. Since this is only a backup option, exit
* latency is not taken into account.
*/
if (latency_sensitive && idle_cpu(cpu) &&
spare_cap > idle_max_spare_cap) {
idle_max_spare_cap = spare_cap;
idle_max_spare_cap_cpu = cpu;
}
/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
* aligned with effective_cpu_util().
*/
util = mtk_uclamp_rq_util_with(cpu_rq(cpu), util, p, min_cap, max_cap);
if (!fits_capacity(util, cpu_cap))
continue;
/*
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
if (!latency_sensitive)
continue;
if (idle_cpu(cpu)) {
cpu_cap = capacity_orig_of(cpu);
idle = idle_get_state(cpu_rq(cpu));
#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
if (idle && idle->exit_latency >= min_exit_lat &&
cpu_cap == target_cap)
continue;
#else
if (idle && idle->exit_latency > min_exit_lat &&
cpu_cap == target_cap)
continue;
#endif
if (spare_cap < max_spare_cap_ls_idle)
continue;
if (idle)
min_exit_lat = idle->exit_latency;
max_spare_cap_ls_idle = spare_cap;
target_cap = cpu_cap;
max_spare_cap_cpu_ls_idle = cpu;
}
}
/* Evaluate the energy impact of using this CPU. */
if (!latency_sensitive && max_spare_cap_cpu >= 0) {
cur_delta = mtk_compute_energy(p, max_spare_cap_cpu, pd, min_cap, max_cap);
if (cur_delta <= best_delta) {
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
if (latency_sensitive) {
if (max_spare_cap_cpu_ls_idle >= 0) {
cur_delta = mtk_compute_energy(p, max_spare_cap_cpu_ls_idle, pd,
min_cap, max_cap);
if (cur_delta <= best_delta) {
best_delta = cur_delta;
best_idle_cpu = max_spare_cap_cpu_ls_idle;
}
}
}
}
rcu_read_unlock();
if (latency_sensitive) {
if (best_idle_cpu >= 0) {
*new_cpu = best_idle_cpu;
select_reason = LB_LATENCY_SENSITIVE_BEST_IDLE_CPU;
} else if (idle_max_spare_cap_cpu >= 0) {
*new_cpu = idle_max_spare_cap_cpu;
select_reason = LB_LATENCY_SENSITIVE_IDLE_MAX_SPARE_CPU;
} else {
*new_cpu = sys_max_spare_cap_cpu;
select_reason = LB_LATENCY_SENSITIVE_MAX_SPARE_CPU;
}
goto done;
}
/* Prefer the best energy cpu; if every cpu failed fits_capacity(), fall back to sys_max_spare_cap_cpu */
if (best_energy_cpu != -1) {
*new_cpu = best_energy_cpu;
select_reason = LB_BEST_ENERGY_CPU;
goto done;
} else {
*new_cpu = sys_max_spare_cap_cpu;
select_reason = LB_MAX_SPARE_CPU;
goto done;
}
*new_cpu = prev_cpu;
select_reason = LB_PREV;
goto done;
unlock:
rcu_read_unlock();
*new_cpu = -1;
done:
if (trace_sched_find_energy_efficient_cpu_enabled())
trace_sched_find_energy_efficient_cpu(best_delta, best_energy_cpu,
best_idle_cpu, idle_max_spare_cap_cpu, sys_max_spare_cap_cpu);
if (trace_sched_select_task_rq_enabled())
trace_sched_select_task_rq(p, select_reason, prev_cpu, *new_cpu,
task_util(p), task_util_est(p), uclamp_task_util(p),
latency_sensitive, sync);
}
#endif
#endif
#if IS_ENABLED(CONFIG_MTK_EAS)
/* caller must hold the runqueue lock of the rq the task is currently queued on */
static struct task_struct *detach_a_hint_task(struct rq *src_rq, int dst_cpu)
{
struct task_struct *p, *best_task = NULL, *backup = NULL;
int dst_capacity;
unsigned int task_util;
bool latency_sensitive = false;
lockdep_assert_rq_held(src_rq);
rcu_read_lock();
dst_capacity = capacity_orig_of(dst_cpu);
list_for_each_entry_reverse(p,
&src_rq->cfs_tasks, se.group_node) {
if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr))
continue;
if (task_running(src_rq, p))
continue;
task_util = uclamp_task_util(p);
if (!uclamp_min_ls)
latency_sensitive = uclamp_latency_sensitive(p);
else {
latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
uclamp_latency_sensitive(p);
}
if (!latency_sensitive)
latency_sensitive = get_task_idle_prefer_by_task(p);
if (latency_sensitive && !cpumask_test_cpu(dst_cpu, &system_cpumask))
continue;
if (latency_sensitive &&
task_util <= dst_capacity) {
best_task = p;
break;
} else if (latency_sensitive && !backup) {
backup = p;
}
}
p = best_task ? best_task : backup;
if (p) {
/* detach_task */
deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, dst_cpu);
}
rcu_read_unlock();
return p;
}
#endif
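/*
* A task is latency sensitive if its cgroup requests it
* (uclamp_latency_sensitive()), if uclamp_min_ls is set and the task has a
* non-zero UCLAMP_MIN request, or if idle-prefer is set for the task.
*/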
inline bool is_task_latency_sensitive(struct task_struct *p)
{
bool latency_sensitive = false;
rcu_read_lock();
if (!uclamp_min_ls)
latency_sensitive = uclamp_latency_sensitive(p);
else {
latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
uclamp_latency_sensitive(p);
}
if (!latency_sensitive)
latency_sensitive = get_task_idle_prefer_by_task(p);
rcu_read_unlock();
return latency_sensitive;
}
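/*
* CPU stopper callback running on the busiest CPU: re-validate the migration
* under the task's pi_lock and the busiest rq lock, then detach @target_task
* and attach it to busiest_rq->push_cpu.
*/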
static int mtk_active_load_balance_cpu_stop(void *data)
{
struct task_struct *target_task = data;
int busiest_cpu = smp_processor_id();
struct rq *busiest_rq = cpu_rq(busiest_cpu);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct rq_flags rf;
int deactivated = 0;
local_irq_disable();
raw_spin_lock(&target_task->pi_lock);
rq_lock(busiest_rq, &rf);
if (task_cpu(target_task) != busiest_cpu ||
(!cpumask_test_cpu(target_cpu, target_task->cpus_ptr)) ||
task_running(busiest_rq, target_task) ||
target_rq == busiest_rq)
goto out_unlock;
if (!task_on_rq_queued(target_task))
goto out_unlock;
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
if (cpu_paused(busiest_cpu) || cpu_paused(target_cpu))
goto out_unlock;
/* Make sure the requested CPU hasn't gone down in the meantime: */
if (unlikely(!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
update_rq_clock(busiest_rq);
deactivate_task(busiest_rq, target_task, DEQUEUE_NOCLOCK);
set_task_cpu(target_task, target_cpu);
deactivated = 1;
out_unlock:
busiest_rq->active_balance = 0;
rq_unlock(busiest_rq, &rf);
if (deactivated)
attach_one_task(target_rq, target_task);
raw_spin_unlock(&target_task->pi_lock);
put_task_struct(target_task);
local_irq_enable();
return 0;
}
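/*
* Kick an active load balance that pushes the running task @p from @target
* to @this_cpu; the actual move is performed by
* mtk_active_load_balance_cpu_stop() on @target's stopper thread.
*/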
int migrate_running_task(int this_cpu, struct task_struct *p, struct rq *target, int reason)
{
int active_balance = false;
unsigned long flags;
raw_spin_rq_lock_irqsave(target, flags);
if (!target->active_balance &&
(task_rq(p) == target) && p->__state != TASK_DEAD &&
!(is_task_latency_sensitive(p) && !cpumask_test_cpu(this_cpu, &system_cpumask))) {
target->active_balance = 1;
target->push_cpu = this_cpu;
active_balance = true;
get_task_struct(p);
}
raw_spin_rq_unlock_irqrestore(target, flags);
if (active_balance) {
trace_sched_force_migrate(p, this_cpu, reason);
stop_one_cpu_nowait(cpu_of(target),
mtk_active_load_balance_cpu_stop,
p, &target->active_balance_work);
}
return active_balance;
}
#if IS_ENABLED(CONFIG_MTK_EAS)
static DEFINE_PER_CPU(u64, next_update_new_balance_time_ns);
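/*
* MTK new-idle balance, rate limited via next_update_new_balance_time_ns:
* first try to pull a queued latency-sensitive task from another rq
* (detach_a_hint_task()); failing that, migrate the largest misfit running
* task from a lower-capacity CPU.
*/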
void mtk_sched_newidle_balance(void *data, struct rq *this_rq, struct rq_flags *rf,
int *pulled_task, int *done)
{
int cpu;
struct rq *src_rq, *misfit_task_rq = NULL;
struct task_struct *p = NULL, *best_running_task = NULL;
struct rq_flags src_rf;
int this_cpu = this_rq->cpu;
unsigned long misfit_load = 0;
u64 now_ns;
if (cpu_paused(this_cpu)) {
*done = 1;
return;
}
/*
* There is a task waiting to run. No need to search for one;
* it will be enqueued when switching to idle.
*/
if (this_rq->ttwu_pending)
return;
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
/*
* Do not pull tasks towards !active CPUs...
*/
if (!cpu_active(this_cpu))
return;
now_ns = ktime_get_real_ns();
if (now_ns < per_cpu(next_update_new_balance_time_ns, this_cpu))
return;
per_cpu(next_update_new_balance_time_ns, this_cpu) =
now_ns + new_idle_balance_interval_ns;
trace_sched_next_new_balance(now_ns, per_cpu(next_update_new_balance_time_ns, this_cpu));
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
rq_unpin_lock(this_rq, rf);
raw_spin_rq_unlock(this_rq);
this_cpu = this_rq->cpu;
for_each_cpu(cpu, cpu_active_mask) {
if (cpu == this_cpu)
continue;
src_rq = cpu_rq(cpu);
rq_lock_irqsave(src_rq, &src_rf);
update_rq_clock(src_rq);
if (src_rq->active_balance) {
rq_unlock_irqrestore(src_rq, &src_rf);
continue;
}
if (src_rq->misfit_task_load > misfit_load &&
capacity_orig_of(this_cpu) > capacity_orig_of(cpu)) {
p = src_rq->curr;
if (p && p->policy == SCHED_NORMAL &&
cpumask_test_cpu(this_cpu, p->cpus_ptr) &&
!(is_task_latency_sensitive(p) &&
!cpumask_test_cpu(this_cpu, &system_cpumask))) {
misfit_task_rq = src_rq;
misfit_load = src_rq->misfit_task_load;
if (best_running_task)
put_task_struct(best_running_task);
best_running_task = p;
get_task_struct(best_running_task);
}
p = NULL;
}
if (src_rq->nr_running <= 1) {
rq_unlock_irqrestore(src_rq, &src_rf);
continue;
}
p = detach_a_hint_task(src_rq, this_cpu);
rq_unlock_irqrestore(src_rq, &src_rf);
if (p) {
trace_sched_force_migrate(p, this_cpu, MIGR_IDLE_BALANCE);
attach_one_task(this_rq, p);
break;
}
}
/*
* If p is NULL, meaning we did not pull a runnable task, try to migrate
* the misfit running task found above instead.
*/
if (!p && misfit_task_rq)
*done = migrate_running_task(this_cpu, best_running_task,
misfit_task_rq, MIGR_IDLE_PULL_MISFIT_RUNNING);
if (best_running_task)
put_task_struct(best_running_task);
raw_spin_rq_lock(this_rq);
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
if (this_rq->cfs.h_nr_running && !*pulled_task)
*pulled_task = 1;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
*pulled_task = -1;
if (*pulled_task)
this_rq->idle_stamp = 0;
if (*pulled_task != 0)
*done = 1;
rq_repin_lock(this_rq, rf);
}
#endif