// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2021 MediaTek Inc.
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/hooks/sched.h>
#include <sched/sched.h>
#include "eas/eas_plus.h"
#include "sugov/cpufreq.h"
#if IS_ENABLED(CONFIG_MTK_GEARLESS_SUPPORT)
#include "mtk_energy_model/v2/energy_model.h"
#else
#include "mtk_energy_model/v1/energy_model.h"
#endif
#include "common.h"
#include <sched/pelt.h>
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#if IS_ENABLED(CONFIG_MTK_THERMAL_INTERFACE)
#include <thermal_interface.h>
#endif

#define CREATE_TRACE_POINTS
#include "sched_trace.h"

MODULE_LICENSE("GPL");

/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Remove and clamp on negative, from a local variable.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)

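/*
 * The helpers below mirror the static utilization helpers in
 * kernel/sched/fair.c (task_util(), _task_util_est(), task_util_est(),
 * uclamp_task_util(), capacity_of(), cpu_util()), with an additional
 * is_util_est_enable() runtime check on the util_est paths.
 */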
#ifdef CONFIG_SMP
static inline unsigned long task_util(struct task_struct *p)
{
	return READ_ONCE(p->se.avg.util_avg);
}

static inline unsigned long _task_util_est(struct task_struct *p)
{
	struct util_est ue = READ_ONCE(p->se.avg.util_est);

	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}

static inline unsigned long task_util_est(struct task_struct *p)
{
	if (sched_feat(UTIL_EST) && is_util_est_enable())
		return max(task_util(p), _task_util_est(p));
	return task_util(p);
}

#ifdef CONFIG_UCLAMP_TASK
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
	return clamp(task_util_est(p),
		     uclamp_eff_value(p, UCLAMP_MIN),
		     uclamp_eff_value(p, UCLAMP_MAX));
}
#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
	return task_util_est(p);
}
#endif

int task_fits_capacity(struct task_struct *p, long capacity)
{
	return fits_capacity(uclamp_task_util(p), capacity);
}

unsigned long capacity_of(int cpu)
{
	return cpu_rq(cpu)->cpu_capacity;
}

unsigned long cpu_util(int cpu)
{
	struct cfs_rq *cfs_rq;
	unsigned int util;

	cfs_rq = &cpu_rq(cpu)->cfs;
	util = READ_ONCE(cfs_rq->avg.util_avg);

	if (sched_feat(UTIL_EST) && is_util_est_enable())
		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

	return min_t(unsigned long, util, capacity_orig_of(cpu));
}

#if IS_ENABLED(CONFIG_MTK_EAS)
/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 * to @dst_cpu.
 */
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);

	/*
	 * If @p migrates from @cpu to another, remove its contribution. Or,
	 * if @p migrates from another CPU to @cpu, add its contribution. In
	 * the other cases, @cpu is not impacted by the migration, so the
	 * util_avg should already be correct.
	 */
	if (task_cpu(p) == cpu && dst_cpu != cpu)
		lsub_positive(&util, task_util(p));
	else if (task_cpu(p) != cpu && dst_cpu == cpu)
		util += task_util(p);

	if (sched_feat(UTIL_EST) && is_util_est_enable()) {
		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);

		/*
		 * During wake-up, the task isn't enqueued yet and doesn't
		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
		 * so just add it (if needed) to "simulate" what will be
		 * cpu_util() after the task has been enqueued.
		 */
		if (dst_cpu == cpu)
			util_est += _task_util_est(p);

		util = max(util, util_est);
	}

	return min(util, capacity_orig_of(cpu));
}

/*
 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
 * to @dst_cpu.
 *
 * The caller provides the raw inputs for @cpu:
 *   util_freq = READ_ONCE(cfs_rq->avg.util_avg);
 *   util_est  = READ_ONCE(cfs_rq->avg.util_est.enqueued), when UTIL_EST
 *               is enabled.
 */
static unsigned long mtk_cpu_util_next(int cpu, struct task_struct *p, int dst_cpu,
				       unsigned long util_freq, unsigned long util_est)
{
	/*
	 * If @p migrates from @cpu to another, remove its contribution. Or,
	 * if @p migrates from another CPU to @cpu, add its contribution. In
	 * the other cases, @cpu is not impacted by the migration, so the
	 * util_avg should already be correct.
	 */
	if (task_cpu(p) == cpu && dst_cpu != cpu)
		lsub_positive(&util_freq, task_util(p));
	else if (task_cpu(p) != cpu && dst_cpu == cpu)
		util_freq += task_util(p);

	if (sched_feat(UTIL_EST) && is_util_est_enable()) {
		/*
		 * During wake-up, the task isn't enqueued yet and doesn't
		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
		 * so just add it (if needed) to "simulate" what will be
		 * cpu_util() after the task has been enqueued.
		 */
		if (dst_cpu == cpu)
			util_est += _task_util_est(p);

		util_freq = max(util_freq, util_est);
	}

	return min(util_freq, capacity_orig_of(cpu));
}

/*
 * mtk_compute_energy(): Estimates the energy that @pd would consume if @p was
 * migrated to @dst_cpu. It predicts what the utilization landscape of @pd's
 * CPUs will be after the task migration, and uses the Energy Model to compute
 * what the energy would be if we decided to actually migrate that task.
 *
 * Returns the energy delta of placing task @p on @dst_cpu.
 */
static unsigned long
mtk_compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd,
		   unsigned long min_cap, unsigned long max_cap)
{
	struct cpumask *pd_mask = perf_domain_span(pd);
	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
	unsigned long max_util_base = 0, max_util_cur = 0;
	unsigned long cpu_energy_util, sum_util_base = 0, sum_util_cur = 0;
	unsigned long _cpu_cap = cpu_cap;
	unsigned long energy_base = 0, energy_cur = 0, energy_delta = 0;
	int cpu;
	int cpu_temp[NR_CPUS];

	_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));

	/*
	 * The capacity state of CPUs of the current rd can be driven by CPUs
	 * of another rd if they belong to the same pd. So, account for the
	 * utilization of these CPUs too by masking pd with cpu_online_mask
	 * instead of the rd span.
	 *
	 * If an entire pd is outside of the current rd, it will not appear in
	 * its pd list and will not be accounted by compute_energy().
	 */
	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
		unsigned long cpu_util_base, cpu_util_cur;
		unsigned long util_freq_base, util_freq_cur, util_running_base, util_running_cur;
		struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
		struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
		unsigned long util_est = 0, util_freq = READ_ONCE(cfs_rq->avg.util_avg);
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
		struct util_rq util_rq_energy, util_rq_freq;
#endif

		if (sched_feat(UTIL_EST) && is_util_est_enable())
			util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);

		util_freq_base = mtk_cpu_util_next(cpu, p, -1, util_freq, util_est);
		util_running_base = util_freq_base;

		/*
		 * Busy time computation: utilization clamping is not
		 * required since the ratio (sum_util / cpu_capacity)
		 * is already enough to scale the EM reported power
		 * consumption at the (eventually clamped) cpu_capacity.
		 */
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
		util_rq_freq.util_cfs = util_freq_base;
		util_rq_freq.base = 1;
		util_rq_energy.util_cfs = util_running_base;
		util_rq_energy.base = 1;

		cpu_energy_util = mtk_cpu_util(cpu, &util_rq_energy, cpu_cap,
					       ENERGY_UTIL, NULL, min_cap, max_cap);
#else
		cpu_energy_util = effective_cpu_util(cpu, util_running_base, cpu_cap,
						     ENERGY_UTIL, NULL);
#endif
		sum_util_base += min(cpu_energy_util, _cpu_cap);

		/*
		 * Performance domain frequency: utilization clamping
		 * must be considered since it affects the selection
		 * of the performance domain frequency.
		 * NOTE: in case RT tasks are running, by default the
		 * FREQUENCY_UTIL's utilization can be max OPP.
		 */
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
		cpu_util_base = mtk_cpu_util(cpu, &util_rq_freq, cpu_cap,
					     FREQUENCY_UTIL, NULL, min_cap, max_cap);
#else
		cpu_util_base = effective_cpu_util(cpu, util_freq_base, cpu_cap,
						   FREQUENCY_UTIL, NULL);
#endif
		/*
		 * When @p is placed on @cpu:
		 *
		 *   util_running = max(cpu_util, cpu_util_est) +
		 *                  max(task_util, _task_util_est)
		 *
		 * while cpu_util_next is: max(cpu_util + task_util,
		 *                             cpu_util_est + _task_util_est)
		 */
		if (cpu == dst_cpu) {
			util_freq_cur = mtk_cpu_util_next(cpu, p, dst_cpu, util_freq, util_est);
			util_running_cur =
				mtk_cpu_util_next(cpu, p, -1, util_freq, util_est)
				+ task_util_est(p);
			/*
			 * Busy time computation: utilization clamping is not
			 * required since the ratio (sum_util / cpu_capacity)
			 * is already enough to scale the EM reported power
			 * consumption at the (eventually clamped) cpu_capacity.
			 */
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
			util_rq_freq.util_cfs = util_freq_cur;
			util_rq_energy.util_cfs = util_running_cur;

			cpu_energy_util = mtk_cpu_util(cpu, &util_rq_energy, cpu_cap,
						       ENERGY_UTIL, NULL, min_cap, max_cap);
#else
			cpu_energy_util = effective_cpu_util(cpu, util_running_cur, cpu_cap,
							     ENERGY_UTIL, NULL);
#endif
			sum_util_cur += min(cpu_energy_util, _cpu_cap);

			/*
			 * Performance domain frequency: utilization clamping
			 * must be considered since it affects the selection
			 * of the performance domain frequency.
			 * NOTE: in case RT tasks are running, by default the
			 * FREQUENCY_UTIL's utilization can be max OPP.
			 */
#if IS_ENABLED(CONFIG_MTK_CPUFREQ_SUGOV_EXT)
			cpu_util_cur = mtk_cpu_util(cpu, &util_rq_freq, cpu_cap,
						    FREQUENCY_UTIL, tsk, min_cap, max_cap);
#else
			cpu_util_cur = effective_cpu_util(cpu, util_freq_cur, cpu_cap,
							  FREQUENCY_UTIL, tsk);
#endif
		} else {
			util_running_cur = util_running_base;
			util_freq_cur = util_freq_base;
			sum_util_cur += cpu_energy_util;
			cpu_util_cur = cpu_util_base;
		}

		max_util_base = max(max_util_base, min(cpu_util_base, _cpu_cap));
		max_util_cur = max(max_util_cur, min(cpu_util_cur, _cpu_cap));

		if (trace_sched_energy_util_enabled()) {
			trace_sched_energy_util(-1, max_util_base, sum_util_base, cpu,
						util_freq_base, util_running_base, cpu_util_base);
			trace_sched_energy_util(dst_cpu, max_util_cur, sum_util_cur, cpu,
						util_freq_cur, util_running_cur, cpu_util_cur);
		}

		/* get temperature for each cpu */
		cpu_temp[cpu] = get_cpu_temp(cpu);
		cpu_temp[cpu] /= 1000;
	}

	energy_base = mtk_em_cpu_energy(pd->em_pd, max_util_base, sum_util_base,
					_cpu_cap, cpu_temp);
	energy_cur = mtk_em_cpu_energy(pd->em_pd, max_util_cur, sum_util_cur,
				       _cpu_cap, cpu_temp);
	energy_delta = energy_cur - energy_base;

	if (trace_sched_compute_energy_enabled()) {
		trace_sched_compute_energy(-1, pd_mask, energy_base, max_util_base, sum_util_base);
		trace_sched_compute_energy(dst_cpu, pd_mask, energy_cur, max_util_cur,
					   sum_util_cur);
	}

	return energy_delta;
}
#endif

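/*
 * When uclamp_min_ls is non-zero, a task that requests a non-zero uclamp
 * min is treated as latency sensitive by the placement paths below. The
 * setter/getter are exported so the policy can be changed at runtime.
 */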
static unsigned int uclamp_min_ls;
void set_uclamp_min_ls(unsigned int val)
{
	uclamp_min_ls = val;
}
EXPORT_SYMBOL_GPL(set_uclamp_min_ls);

unsigned int get_uclamp_min_ls(void)
{
	return uclamp_min_ls;
}
EXPORT_SYMBOL_GPL(get_uclamp_min_ls);

/*
 * attach_task() -- attach the task detached by detach_task() to its new rq.
 */
static void attach_task(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	BUG_ON(task_rq(p) != rq);
	activate_task(rq, p, ENQUEUE_NOCLOCK);
	check_preempt_curr(rq, p, 0);
}

/*
 * attach_one_task() -- attaches the task returned from detach_one_task() to
 * its new rq.
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	attach_task(rq, p);
	rq_unlock(rq, &rf);
}

#if IS_ENABLED(CONFIG_MTK_EAS)
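/*
 * system_cpumask acts as a soft affinity mask for latency-sensitive tasks:
 * the placement and balance paths below avoid putting such tasks on CPUs
 * outside it. It defaults to all possible CPUs and can be overridden through
 * the exported setters.
 */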
struct cpumask system_cpumask;

void init_system_cpumask(void)
{
	cpumask_copy(&system_cpumask, cpu_possible_mask);
}

void set_system_cpumask(const struct cpumask *srcp)
{
	cpumask_copy(&system_cpumask, srcp);
}
EXPORT_SYMBOL_GPL(set_system_cpumask);

void set_system_cpumask_int(unsigned int cpumask_val)
{
	struct cpumask cpumask_setting;
	unsigned long cpumask_ulval = cpumask_val;
	int cpu;

	cpumask_clear(&cpumask_setting);
	for_each_possible_cpu(cpu) {
		if (test_bit(cpu, &cpumask_ulval))
			cpumask_set_cpu(cpu, &cpumask_setting);
	}

	cpumask_copy(&system_cpumask, &cpumask_setting);
}
EXPORT_SYMBOL_GPL(set_system_cpumask_int);

struct cpumask *get_system_cpumask(void)
{
	return &system_cpumask;
}
EXPORT_SYMBOL_GPL(get_system_cpumask);

static struct cpumask bcpus;
static unsigned long util_Th;

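/*
 * Cache the cpumask of the most powerful gear (bcpus) and a utilization
 * threshold (util_Th) derived from the second most powerful gear. The
 * interrupt placement path uses them to let small tasks skip the big CPUs.
 */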
void get_most_powerful_pd_and_util_Th(void)
{
	unsigned int nr_gear = get_nr_gears();

	/* no multiple pd */
	if (WARN_ON(nr_gear <= 1)) {
		util_Th = 0;
		return;
	}

	/*
	 * pd_capacity_tbl is sorted in ascending order, so gear nr_gear-1
	 * is the most powerful gear and gear nr_gear-2 is the second most
	 * powerful gear.
	 */
	cpumask_copy(&bcpus, get_gear_cpumask(nr_gear - 1));
	/* the threshold is the largest capacity of the second most powerful gear */
	util_Th = pd_get_opp_capacity(
			cpumask_first(get_gear_cpumask(nr_gear - 2)), 0);
}

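/*
 * A task may skip a CPU of the most powerful gear (bcpus) when it is not
 * latency sensitive, has no uclamp min request, and its estimated
 * utilization fits under util_Th, i.e. small tasks are steered away from
 * the big cores in the interrupt placement path.
 */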
static inline bool task_can_skip_this_cpu(struct task_struct *p, unsigned long p_uclamp_min,
					  bool latency_sensitive, int cpu, struct cpumask *bcpus)
{
	bool cpu_in_bcpus;
	unsigned long task_util;

	if (latency_sensitive)
		return false;

	if (p_uclamp_min > 0)
		return false;

	if (cpumask_empty(bcpus))
		return false;

	cpu_in_bcpus = cpumask_test_cpu(cpu, bcpus);
	task_util = task_util_est(p);
	if (!cpu_in_bcpus || !fits_capacity(task_util, util_Th))
		return false;

	return true;
}

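/*
 * Lightweight placement path used when the wake-up happens in interrupt
 * context: walk each perf domain once, rank the per-gear candidates with
 * calc_pwr_eff() instead of a full energy computation, and fall back to
 * this_cpu, prev_cpu or any allowed CPU when nothing fits.
 */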
int mtk_find_energy_efficient_cpu_in_interrupt(struct task_struct *p, bool latency_sensitive,
		struct perf_domain *pd, unsigned long min_cap, unsigned long max_cap)
{
	int target_cpu = -1, cpu;
	unsigned long cpu_util;
	unsigned long pwr, best_pwr = ULONG_MAX, best_idle_pwr = ULONG_MAX;
	unsigned long cpu_cap = 0;
	unsigned int fit_cpus = 0;
	unsigned int idle_cpus = 0;
	long max_spare_cap = LONG_MIN, spare_cap, max_spare_cap_per_gear;
	int max_spare_cap_cpu = -1, max_spare_cap_cpu_per_gear;
	long sys_max_spare_cap = LONG_MIN, idle_max_spare_cap = LONG_MIN;
	int sys_max_spare_cap_cpu = -1, idle_max_spare_cap_cpu = -1;
	unsigned long util;
	bool not_in_softmask;
	unsigned int min_exit_lat = UINT_MAX, min_exit_lat_per_gear;
	struct cpuidle_state *idle;
	int best_idle_cpu = -1, best_idle_cpu_per_gear;
	long best_idle_max_spare_cap = LONG_MIN, best_idle_cpu_cap_per_gear;
	int this_cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	int select_reason = -1;
	struct cpumask allowed_cpu_mask;
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	u64 ts[9] = {0};

	ts[0] = sched_clock();
#endif

	/* the allowed mask is built up below, so it must start out empty */
	cpumask_clear(&allowed_cpu_mask);

	for (; pd; pd = pd->next) {
		max_spare_cap_cpu_per_gear = -1;
		max_spare_cap_per_gear = LONG_MIN;
		min_exit_lat_per_gear = UINT_MAX;
		best_idle_cpu_per_gear = -1;
		best_idle_cpu_cap_per_gear = LONG_MIN;

		for_each_cpu_and(cpu, perf_domain_span(pd), cpu_active_mask) {
			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;

			if (cpu_paused(cpu))
				continue;

			cpumask_set_cpu(cpu, &allowed_cpu_mask);

			if (task_can_skip_this_cpu(p, min_cap, latency_sensitive, cpu, &bcpus))
				continue;

			if (cpu_rq(cpu)->rt.rt_nr_running >= 1 &&
			    !rt_rq_throttled(&(cpu_rq(cpu)->rt)))
				continue;

			util = cpu_util_next(cpu, p, cpu);
			cpu_cap = capacity_of(cpu);
			spare_cap = cpu_cap;
			lsub_positive(&spare_cap, util);
			not_in_softmask = (latency_sensitive &&
					   !cpumask_test_cpu(cpu, &system_cpumask));

			if (not_in_softmask)
				continue;

			/* record sys_max_spare_cap_cpu */
			if (spare_cap > sys_max_spare_cap) {
				sys_max_spare_cap = spare_cap;
				sys_max_spare_cap_cpu = cpu;
			}

			/*
			 * If there is no best idle cpu, select the idle cpu
			 * with max spare capacity for a latency_sensitive
			 * task, to avoid leaving it runnable. Because this is
			 * just a backup option, we do not take exit latency
			 * into account.
			 */
			if (latency_sensitive && idle_cpu(cpu) &&
			    spare_cap > idle_max_spare_cap) {
				idle_max_spare_cap = spare_cap;
				idle_max_spare_cap_cpu = cpu;
			}

			/*
			 * Skip CPUs that cannot satisfy the capacity request.
			 * IOW, placing the task there would make the CPU
			 * overutilized. Take uclamp into account to see how
			 * much capacity we can get out of the CPU; this is
			 * aligned with effective_cpu_util().
			 */
			cpu_util = mtk_uclamp_rq_util_with(cpu_rq(cpu), util, p, min_cap, max_cap);
			if (!fits_capacity(cpu_util, cpu_cap))
				continue;

			fit_cpus = (fit_cpus | (1 << cpu));

			/*
			 * Find the CPU with the maximum spare capacity in
			 * the performance domain
			 */
			if (spare_cap > max_spare_cap_per_gear) {
				max_spare_cap_per_gear = spare_cap;
				max_spare_cap_cpu_per_gear = cpu;
			}

			if (!latency_sensitive)
				continue;

			if (idle_cpu(cpu)) {
				idle_cpus = (idle_cpus | (1 << cpu));
				idle = idle_get_state(cpu_rq(cpu));
				if (idle) {
					/* non WFI, find shortest exit_latency */
					if (idle->exit_latency < min_exit_lat_per_gear) {
						min_exit_lat_per_gear = idle->exit_latency;
						best_idle_cpu_per_gear = cpu;
						best_idle_cpu_cap_per_gear = spare_cap;
					} else if ((idle->exit_latency == min_exit_lat_per_gear)
						   && (best_idle_cpu_cap_per_gear < spare_cap)) {
						best_idle_cpu_per_gear = cpu;
						best_idle_cpu_cap_per_gear = spare_cap;
					}
				} else {
					/* WFI, find max_spare_cap */
					if (min_exit_lat_per_gear > 0) {
						min_exit_lat_per_gear = 0;
						best_idle_cpu_per_gear = cpu;
						best_idle_cpu_cap_per_gear = spare_cap;
					} else if (best_idle_cpu_cap_per_gear < spare_cap) {
						best_idle_cpu_per_gear = cpu;
						best_idle_cpu_cap_per_gear = spare_cap;
					}
				}
			}
		}

		/* not latency_sensitive: select the max spare capacity cpu of this gear */
		if (!latency_sensitive && max_spare_cap_cpu_per_gear >= 0) {
			/* calculate the power consumption of the candidate cpu of this gear */
			pwr = calc_pwr_eff(max_spare_cap_cpu_per_gear, cpu_util);
			/* if the cpu power is better, select it as the candidate */
			if (best_pwr > pwr) {
				best_pwr = pwr;
				max_spare_cap_cpu = max_spare_cap_cpu_per_gear;
				max_spare_cap = max_spare_cap_per_gear;
			}
			/* if the power of the two cpus is identical, select the larger spare capacity */
			else if ((best_pwr == pwr) && (max_spare_cap < max_spare_cap_per_gear)) {
				max_spare_cap_cpu = max_spare_cap_cpu_per_gear;
				max_spare_cap = max_spare_cap_per_gear;
			}
		}

		/* latency_sensitive task: select best_idle_cpu (lightest sleep) */
		if (latency_sensitive && best_idle_cpu_per_gear >= 0) {
			pwr = calc_pwr_eff(best_idle_cpu_per_gear, cpu_util);
			if (best_idle_pwr > pwr) {
				best_idle_pwr = pwr;
				best_idle_cpu = best_idle_cpu_per_gear;
				best_idle_max_spare_cap = best_idle_cpu_cap_per_gear;
				min_exit_lat = min_exit_lat_per_gear;
			}
			/* if the power of the two cpus is identical, select the larger spare capacity */
			else if ((best_idle_pwr == pwr)
				 && (best_idle_max_spare_cap < best_idle_cpu_cap_per_gear)) {
				best_idle_cpu = best_idle_cpu_per_gear;
				best_idle_max_spare_cap = best_idle_cpu_cap_per_gear;
				min_exit_lat = min_exit_lat_per_gear;
			}
		}
	}

#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[1] = sched_clock();
#endif

	if (latency_sensitive) {
		if (best_idle_cpu >= 0) {
			/* a best idle cpu exists */
			target_cpu = best_idle_cpu;
			select_reason = LB_LATENCY_SENSITIVE_BEST_IDLE_CPU;
		} else if (idle_max_spare_cap_cpu >= 0) {
			target_cpu = idle_max_spare_cap_cpu;
			select_reason = LB_LATENCY_SENSITIVE_IDLE_MAX_SPARE_CPU;
		} else {
			target_cpu = sys_max_spare_cap_cpu;
			select_reason = LB_LATENCY_SENSITIVE_MAX_SPARE_CPU;
		}
		goto out;
	}

#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[2] = sched_clock();
#endif

	if (max_spare_cap_cpu != -1) {
		target_cpu = max_spare_cap_cpu;
		select_reason = LB_BEST_ENERGY_CPU;
		goto out;
	}

#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[3] = sched_clock();
#endif

	/* all cpus failed the fits_capacity() check, use sys_max_spare_cap_cpu */
	if (sys_max_spare_cap_cpu != -1) {
		target_cpu = sys_max_spare_cap_cpu;
		select_reason = LB_MAX_SPARE_CPU;
		goto out;
	}

#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[4] = sched_clock();
#endif

	/*
	 * No best_idle_cpu and no max_spare_cap_cpu are available:
	 * select this_cpu or prev_cpu from allowed_cpu_mask.
	 */
	if (target_cpu == -1) {
		if (cpumask_test_cpu(this_cpu, &allowed_cpu_mask)) {
			target_cpu = this_cpu;
			select_reason = LB_IRQ_BACKUP_CURR;
			goto out;
		}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
		ts[5] = sched_clock();
#endif
		if (cpumask_test_cpu(prev_cpu, &allowed_cpu_mask)) {
			target_cpu = prev_cpu;
			select_reason = LB_IRQ_BACKUP_PREV;
			goto out;
		}
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
		ts[6] = sched_clock();
#endif
		/* select a cpu in allowed_cpu_mask: not paused and no rt running */
		if (cpumask_empty(&allowed_cpu_mask))
			target_cpu = this_cpu;
		else
			target_cpu = cpumask_any(&allowed_cpu_mask);
		select_reason = LB_IRQ_BACKUP_ALLOWED;
	}

out:
#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[7] = sched_clock();
#endif

	if (trace_sched_find_cpu_in_irq_enabled())
		trace_sched_find_cpu_in_irq(p, select_reason, target_cpu,
					    prev_cpu, fit_cpus, idle_cpus,
					    best_idle_cpu, best_idle_pwr, min_exit_lat,
					    max_spare_cap_cpu, best_pwr, max_spare_cap);

#if IS_ENABLED(CONFIG_MTK_IRQ_MONITOR_DEBUG)
	ts[8] = sched_clock();

	if ((ts[8] - ts[0] > 1000000ULL) && in_hardirq()) {
		int i, i_prev;
		u64 prev, curr;

		printk_deferred("%s duration %llu, ts[0]=%llu\n", __func__, ts[8] - ts[0], ts[0]);
		i_prev = 0;
		for (i = 0; i < 8; i++) {
			if (ts[i+1]) {
				prev = ts[i_prev];
				curr = ts[i+1];
				printk_deferred("%s ts[%d]=%llu, ts[%d]=%llu, duration=%llu\n",
						__func__, i_prev, prev, i+1, curr, curr - prev);
				i_prev = i+1;
			}
		}
	}
#endif

	return target_cpu;
}

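/*
 * Energy-aware wake-up placement. For each perf domain, find the CPU with
 * the maximum spare capacity that still fits the task, then pick the
 * candidate with the smallest energy delta reported by mtk_compute_energy().
 * Latency-sensitive tasks prefer idle CPUs inside system_cpumask. *new_cpu
 * is set to -1 when no energy-aware choice is made.
 */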
void mtk_find_energy_efficient_cpu(void *data, struct task_struct *p, int prev_cpu, int sync,
				   int *new_cpu)
{
	unsigned long best_delta = ULONG_MAX;
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int best_idle_cpu = -1;
	long sys_max_spare_cap = LONG_MIN, idle_max_spare_cap = LONG_MIN;
	int sys_max_spare_cap_cpu = -1;
	int idle_max_spare_cap_cpu = -1;
	unsigned long target_cap = 0;
	unsigned long cpu_cap, util;
	bool latency_sensitive = false;
	unsigned int min_exit_lat = UINT_MAX;
	int cpu, best_energy_cpu = -1;
	struct cpuidle_state *idle;
	struct perf_domain *pd;
	int select_reason = -1;
	unsigned long min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	unsigned long max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	rcu_read_lock();
	if (!uclamp_min_ls)
		latency_sensitive = uclamp_latency_sensitive(p);
	else {
		latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
				    uclamp_latency_sensitive(p);
	}

	if (!latency_sensitive)
		latency_sensitive = get_task_idle_prefer_by_task(p);

	pd = rcu_dereference(rd->pd);
	if (!pd || READ_ONCE(rd->overutilized)) {
		select_reason = LB_FAIL;
		goto unlock;
	}

	cpu = smp_processor_id();
	if (sync && cpu_rq(cpu)->nr_running == 1 &&
	    cpumask_test_cpu(cpu, p->cpus_ptr) &&
	    task_fits_capacity(p, capacity_of(cpu)) &&
	    !(latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))) {
		rcu_read_unlock();
		*new_cpu = cpu;
		select_reason = LB_SYNC;
		goto done;
	}

	if (unlikely(in_interrupt())) {
		*new_cpu = mtk_find_energy_efficient_cpu_in_interrupt(p, latency_sensitive, pd,
								      min_cap, max_cap);
		rcu_read_unlock();
		select_reason = LB_IN_INTERRUPT;
		goto done;
	}

	if (!task_util_est(p)) {
		select_reason = LB_ZERO_UTIL;
		goto unlock;
	}

	for (; pd; pd = pd->next) {
		unsigned long cur_delta;
		long spare_cap, max_spare_cap = LONG_MIN;
		unsigned long max_spare_cap_ls_idle = 0;
		int max_spare_cap_cpu = -1;
		int max_spare_cap_cpu_ls_idle = -1;
#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
		int cpu_order[NR_CPUS] ____cacheline_aligned, cnt, i;
#endif

#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
		cnt = sort_thermal_headroom(perf_domain_span(pd), cpu_order);

		for (i = 0; i < cnt; i++) {
			cpu = cpu_order[i];
#else
		for_each_cpu_and(cpu, perf_domain_span(pd), cpu_active_mask) {
#endif

			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;

			if (cpu_paused(cpu))
				continue;

			if (cpu_rq(cpu)->rt.rt_nr_running >= 1 &&
			    !rt_rq_throttled(&(cpu_rq(cpu)->rt)))
				continue;

			util = cpu_util_next(cpu, p, cpu);
			cpu_cap = capacity_of(cpu);
			spare_cap = cpu_cap;
			lsub_positive(&spare_cap, util);

			if ((spare_cap > sys_max_spare_cap) &&
			    !(latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))) {
				sys_max_spare_cap = spare_cap;
				sys_max_spare_cap_cpu = cpu;
			}

			if (latency_sensitive && !cpumask_test_cpu(cpu, &system_cpumask))
				continue;

			/*
			 * If there is no best idle cpu, select the idle cpu
			 * with max spare capacity for a latency_sensitive
			 * task, to avoid leaving it runnable. Because this is
			 * just a backup option, we do not take exit latency
			 * into account.
			 */
			if (latency_sensitive && idle_cpu(cpu) &&
			    spare_cap > idle_max_spare_cap) {
				idle_max_spare_cap = spare_cap;
				idle_max_spare_cap_cpu = cpu;
			}

			/*
			 * Skip CPUs that cannot satisfy the capacity request.
			 * IOW, placing the task there would make the CPU
			 * overutilized. Take uclamp into account to see how
			 * much capacity we can get out of the CPU; this is
			 * aligned with effective_cpu_util().
			 */
			util = mtk_uclamp_rq_util_with(cpu_rq(cpu), util, p, min_cap, max_cap);
			if (!fits_capacity(util, cpu_cap))
				continue;

			/*
			 * Find the CPU with the maximum spare capacity in
			 * the performance domain
			 */
			if (spare_cap > max_spare_cap) {
				max_spare_cap = spare_cap;
				max_spare_cap_cpu = cpu;
			}

			if (!latency_sensitive)
				continue;

			if (idle_cpu(cpu)) {
				cpu_cap = capacity_orig_of(cpu);
				idle = idle_get_state(cpu_rq(cpu));
#if IS_ENABLED(CONFIG_MTK_THERMAL_AWARE_SCHEDULING)
				if (idle && idle->exit_latency >= min_exit_lat &&
				    cpu_cap == target_cap)
					continue;
#else
				if (idle && idle->exit_latency > min_exit_lat &&
				    cpu_cap == target_cap)
					continue;
#endif

				if (spare_cap < max_spare_cap_ls_idle)
					continue;

				if (idle)
					min_exit_lat = idle->exit_latency;

				max_spare_cap_ls_idle = spare_cap;
				target_cap = cpu_cap;
				max_spare_cap_cpu_ls_idle = cpu;
			}
		}

		/* Evaluate the energy impact of using this CPU. */
		if (!latency_sensitive && max_spare_cap_cpu >= 0) {
			cur_delta = mtk_compute_energy(p, max_spare_cap_cpu, pd, min_cap, max_cap);
			if (cur_delta <= best_delta) {
				best_delta = cur_delta;
				best_energy_cpu = max_spare_cap_cpu;
			}
		}

		if (latency_sensitive) {
			if (max_spare_cap_cpu_ls_idle >= 0) {
				cur_delta = mtk_compute_energy(p, max_spare_cap_cpu_ls_idle, pd,
							       min_cap, max_cap);
				if (cur_delta <= best_delta) {
					best_delta = cur_delta;
					best_idle_cpu = max_spare_cap_cpu_ls_idle;
				}
			}
		}
	}

	rcu_read_unlock();

	if (latency_sensitive) {
		if (best_idle_cpu >= 0) {
			*new_cpu = best_idle_cpu;
			select_reason = LB_LATENCY_SENSITIVE_BEST_IDLE_CPU;
		} else if (idle_max_spare_cap_cpu >= 0) {
			*new_cpu = idle_max_spare_cap_cpu;
			select_reason = LB_LATENCY_SENSITIVE_IDLE_MAX_SPARE_CPU;
		} else {
			*new_cpu = sys_max_spare_cap_cpu;
			select_reason = LB_LATENCY_SENSITIVE_MAX_SPARE_CPU;
		}
		goto done;
	}

	/*
	 * Pick the best-energy CPU if we found one; otherwise every cpu
	 * failed the fits_capacity() check, so fall back to
	 * sys_max_spare_cap_cpu.
	 */
	if (best_energy_cpu != -1) {
		*new_cpu = best_energy_cpu;
		select_reason = LB_BEST_ENERGY_CPU;
		goto done;
	} else {
		*new_cpu = sys_max_spare_cap_cpu;
		select_reason = LB_MAX_SPARE_CPU;
		goto done;
	}

	*new_cpu = prev_cpu;
	select_reason = LB_PREV;
	goto done;

unlock:
	rcu_read_unlock();

	*new_cpu = -1;
done:
	if (trace_sched_find_energy_efficient_cpu_enabled())
		trace_sched_find_energy_efficient_cpu(best_delta, best_energy_cpu,
				best_idle_cpu, idle_max_spare_cap_cpu, sys_max_spare_cap_cpu);
	if (trace_sched_select_task_rq_enabled())
		trace_sched_select_task_rq(p, select_reason, prev_cpu, *new_cpu,
				task_util(p), task_util_est(p), uclamp_task_util(p),
				latency_sensitive, sync);
}
#endif

#endif

#if IS_ENABLED(CONFIG_MTK_EAS)
/* the caller must hold the runqueue lock of @src_rq, the queue the candidate tasks are on */
static struct task_struct *detach_a_hint_task(struct rq *src_rq, int dst_cpu)
{
	struct task_struct *p, *best_task = NULL, *backup = NULL;
	int dst_capacity;
	unsigned int task_util;
	bool latency_sensitive = false;

	lockdep_assert_rq_held(src_rq);

	rcu_read_lock();
	dst_capacity = capacity_orig_of(dst_cpu);
	list_for_each_entry_reverse(p,
			&src_rq->cfs_tasks, se.group_node) {

		if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr))
			continue;

		if (task_running(src_rq, p))
			continue;

		task_util = uclamp_task_util(p);

		if (!uclamp_min_ls)
			latency_sensitive = uclamp_latency_sensitive(p);
		else {
			latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
					    uclamp_latency_sensitive(p);
		}

		if (!latency_sensitive)
			latency_sensitive = get_task_idle_prefer_by_task(p);

		if (latency_sensitive && !cpumask_test_cpu(dst_cpu, &system_cpumask))
			continue;

		if (latency_sensitive &&
		    task_util <= dst_capacity) {
			best_task = p;
			break;
		} else if (latency_sensitive && !backup) {
			backup = p;
		}
	}
	p = best_task ? best_task : backup;
	if (p) {
		/* detach_task */
		deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
		set_task_cpu(p, dst_cpu);
	}
	rcu_read_unlock();
	return p;
}
#endif

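/*
 * A task counts as latency sensitive when uclamp_latency_sensitive() says
 * so, when uclamp_min_ls is set and the task requests a non-zero uclamp
 * min, or when it carries the per-task idle-prefer hint.
 */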
inline bool is_task_latency_sensitive(struct task_struct *p)
{
	bool latency_sensitive = false;

	rcu_read_lock();
	if (!uclamp_min_ls)
		latency_sensitive = uclamp_latency_sensitive(p);
	else {
		latency_sensitive = (p->uclamp_req[UCLAMP_MIN].value > 0 ? 1 : 0) ||
				    uclamp_latency_sensitive(p);
	}
	if (!latency_sensitive)
		latency_sensitive = get_task_idle_prefer_by_task(p);

	rcu_read_unlock();

	return latency_sensitive;
}

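/*
 * CPU stopper callback running on the busiest CPU: detach the target task
 * from that runqueue and attach it to busiest_rq->push_cpu, unless the task
 * or either CPU is no longer in a state that allows the migration. Drops the
 * task reference taken in migrate_running_task().
 */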
static int mtk_active_load_balance_cpu_stop(void *data)
{
	struct task_struct *target_task = data;
	int busiest_cpu = smp_processor_id();
	struct rq *busiest_rq = cpu_rq(busiest_cpu);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct rq_flags rf;
	int deactivated = 0;

	local_irq_disable();
	raw_spin_lock(&target_task->pi_lock);
	rq_lock(busiest_rq, &rf);

	if (task_cpu(target_task) != busiest_cpu ||
	    (!cpumask_test_cpu(target_cpu, target_task->cpus_ptr)) ||
	    task_running(busiest_rq, target_task) ||
	    target_rq == busiest_rq)
		goto out_unlock;

	if (!task_on_rq_queued(target_task))
		goto out_unlock;

	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
		goto out_unlock;

	if (cpu_paused(busiest_cpu) || cpu_paused(target_cpu))
		goto out_unlock;

	/* Make sure the requested CPU hasn't gone down in the meantime: */
	if (unlikely(!busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	update_rq_clock(busiest_rq);
	deactivate_task(busiest_rq, target_task, DEQUEUE_NOCLOCK);
	set_task_cpu(target_task, target_cpu);
	deactivated = 1;
out_unlock:
	busiest_rq->active_balance = 0;
	rq_unlock(busiest_rq, &rf);

	if (deactivated)
		attach_one_task(target_rq, target_task);

	raw_spin_unlock(&target_task->pi_lock);
	put_task_struct(target_task);

	local_irq_enable();
	return 0;
}

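/*
 * Request an active migration of running task @p from runqueue @target to
 * @this_cpu via the CPU stopper, unless an active balance is already in
 * flight, the task has moved away or died, or @this_cpu lies outside
 * system_cpumask for a latency-sensitive task. Returns true when the
 * stopper work was queued.
 */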
int migrate_running_task(int this_cpu, struct task_struct *p, struct rq *target, int reason)
{
	int active_balance = false;
	unsigned long flags;

	raw_spin_rq_lock_irqsave(target, flags);
	if (!target->active_balance &&
	    (task_rq(p) == target) && p->__state != TASK_DEAD &&
	    !(is_task_latency_sensitive(p) && !cpumask_test_cpu(this_cpu, &system_cpumask))) {
		target->active_balance = 1;
		target->push_cpu = this_cpu;
		active_balance = true;
		get_task_struct(p);
	}
	raw_spin_rq_unlock_irqrestore(target, flags);
	if (active_balance) {
		trace_sched_force_migrate(p, this_cpu, reason);
		stop_one_cpu_nowait(cpu_of(target),
				    mtk_active_load_balance_cpu_stop,
				    p, &target->active_balance_work);
	}

	return active_balance;
}

#if IS_ENABLED(CONFIG_MTK_EAS)
static DEFINE_PER_CPU(u64, next_update_new_balance_time_ns);
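/*
 * Newidle-balance helper, rate limited per CPU by
 * new_idle_balance_interval_ns: first try to pull a queued hint task via
 * detach_a_hint_task(); if none is found, actively migrate the largest
 * misfit running task from a lower-capacity CPU instead.
 */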
void mtk_sched_newidle_balance(void *data, struct rq *this_rq, struct rq_flags *rf,
			       int *pulled_task, int *done)
{
	int cpu;
	struct rq *src_rq, *misfit_task_rq = NULL;
	struct task_struct *p = NULL, *best_running_task = NULL;
	struct rq_flags src_rf;
	int this_cpu = this_rq->cpu;
	unsigned long misfit_load = 0;
	u64 now_ns;

	if (cpu_paused(this_cpu)) {
		*done = 1;
		return;
	}

	/*
	 * There is a task waiting to run. No need to search for one.
	 * Return; the task will be enqueued when switching to idle.
	 */
	if (this_rq->ttwu_pending)
		return;

	/*
	 * We must set idle_stamp _before_ calling idle_balance(), such that we
	 * measure the duration of idle_balance() as idle time.
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Do not pull tasks towards !active CPUs...
	 */
	if (!cpu_active(this_cpu))
		return;

	now_ns = ktime_get_real_ns();

	if (now_ns < per_cpu(next_update_new_balance_time_ns, this_cpu))
		return;

	per_cpu(next_update_new_balance_time_ns, this_cpu) =
		now_ns + new_idle_balance_interval_ns;

	trace_sched_next_new_balance(now_ns, per_cpu(next_update_new_balance_time_ns, this_cpu));

	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
	 * for load-balance and preemption/IRQs are still disabled avoiding
	 * further scheduler activity on it and we're being very careful to
	 * re-start the picking loop.
	 */
	rq_unpin_lock(this_rq, rf);
	raw_spin_rq_unlock(this_rq);

	this_cpu = this_rq->cpu;
	for_each_cpu(cpu, cpu_active_mask) {
		if (cpu == this_cpu)
			continue;

		src_rq = cpu_rq(cpu);
		rq_lock_irqsave(src_rq, &src_rf);
		update_rq_clock(src_rq);
		if (src_rq->active_balance) {
			rq_unlock_irqrestore(src_rq, &src_rf);
			continue;
		}
		if (src_rq->misfit_task_load > misfit_load &&
		    capacity_orig_of(this_cpu) > capacity_orig_of(cpu)) {
			p = src_rq->curr;
			if (p && p->policy == SCHED_NORMAL &&
			    cpumask_test_cpu(this_cpu, p->cpus_ptr) &&
			    !(is_task_latency_sensitive(p) &&
			      !cpumask_test_cpu(this_cpu, &system_cpumask))) {

				misfit_task_rq = src_rq;
				misfit_load = src_rq->misfit_task_load;
				if (best_running_task)
					put_task_struct(best_running_task);
				best_running_task = p;
				get_task_struct(best_running_task);
			}
			p = NULL;
		}

		if (src_rq->nr_running <= 1) {
			rq_unlock_irqrestore(src_rq, &src_rf);
			continue;
		}

		p = detach_a_hint_task(src_rq, this_cpu);

		rq_unlock_irqrestore(src_rq, &src_rf);

		if (p) {
			trace_sched_force_migrate(p, this_cpu, MIGR_IDLE_BALANCE);
			attach_one_task(this_rq, p);
			break;
		}
	}

	/*
	 * If p is NULL, meaning we have not pulled a runnable task, try to
	 * actively pull the biggest misfit running task instead.
	 */
	if (!p && misfit_task_rq)
		*done = migrate_running_task(this_cpu, best_running_task,
				misfit_task_rq, MIGR_IDLE_PULL_MISFIT_RUNNING);
	if (best_running_task)
		put_task_struct(best_running_task);
	raw_spin_rq_lock(this_rq);
	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
	if (this_rq->cfs.h_nr_running && !*pulled_task)
		*pulled_task = 1;

	/* Is there a task of a high priority class? */
	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
		*pulled_task = -1;

	if (*pulled_task)
		this_rq->idle_stamp = 0;

	if (*pulled_task != 0)
		*done = 1;

	rq_repin_lock(this_rq, rf);
}
#endif