diff -Naur linux-2.4.7/arch/i386/kernel/apm.c linux-2.4.7-mq/arch/i386/kernel/apm.c
--- linux-2.4.7/arch/i386/kernel/apm.c Fri Apr 6 17:42:47 2001
+++ linux-2.4.7-mq/arch/i386/kernel/apm.c Mon Aug 6 15:59:32 2001
@@ -1092,7 +1092,11 @@
* decide if we should just power down.
*
*/
+#ifdef CONFIG_SMP
+#define system_idle() (nr_running() == 1)
+#else
#define system_idle() (nr_running == 1)
+#endif
static void apm_mainloop(void)
{
diff -Naur linux-2.4.7/arch/i386/kernel/smpboot.c linux-2.4.7-mq/arch/i386/kernel/smpboot.c
--- linux-2.4.7/arch/i386/kernel/smpboot.c Tue Feb 13 22:13:43 2001
+++ linux-2.4.7-mq/arch/i386/kernel/smpboot.c Mon Aug 6 15:59:32 2001
@@ -562,13 +562,18 @@
if (!idle)
panic("No idle process for CPU %d", cpu);
- idle->processor = cpu;
x86_cpu_to_apicid[cpu] = apicid;
x86_apicid_to_cpu[apicid] = cpu;
- idle->has_cpu = 1; /* we schedule the first task manually */
idle->thread.eip = (unsigned long) start_secondary;
del_from_runqueue(idle);
+ /*
+ * Don't change processor/runqueue of task while it is
+ * still on runqueue.
+ */
+ idle->processor = cpu;
+ idle->has_cpu = 1; /* we schedule the first task manually */
+
unhash_process(idle);
init_tasks[cpu] = idle;
diff -Naur linux-2.4.7/fs/proc/proc_misc.c linux-2.4.7-mq/fs/proc/proc_misc.c
--- linux-2.4.7/fs/proc/proc_misc.c Sat Jul 7 18:43:24 2001
+++ linux-2.4.7-mq/fs/proc/proc_misc.c Mon Aug 6 15:59:32 2001
@@ -96,7 +96,11 @@
LOAD_INT(a), LOAD_FRAC(a),
LOAD_INT(b), LOAD_FRAC(b),
LOAD_INT(c), LOAD_FRAC(c),
+#ifdef CONFIG_SMP
+ nr_running(), nr_threads, last_pid);
+#else
nr_running, nr_threads, last_pid);
+#endif
return proc_calc_metrics(page, start, off, count, eof, len);
}
diff -Naur linux-2.4.7/include/linux/sched.h linux-2.4.7-mq/include/linux/sched.h
--- linux-2.4.7/include/linux/sched.h Fri Jul 20 19:52:18 2001
+++ linux-2.4.7-mq/include/linux/sched.h Mon Aug 6 22:31:00 2001
@@ -70,9 +70,15 @@
#define CT_TO_SECS(x) ((x) / HZ)
#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
-extern int nr_running, nr_threads;
+extern int nr_threads;
extern int last_pid;
+#ifdef CONFIG_SMP
+extern int nr_running(void);
+#else
+extern int nr_running;
+#endif
+
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/param.h>
@@ -131,13 +137,81 @@
#include <linux/spinlock.h>
/*
+ * aligned_data
+ * CPU specific scheduling data. One data item for each CPU
+ * in the system. Size should be a multiple of cache line size,
+ * and array of items should start on a cache line boundary.
+ */
+typedef union aligned_data {
+ struct schedule_data {
+ struct task_struct * curr; /* current task on this CPU */
+ cycles_t last_schedule; /* time of last scheduling */
+ /* decision */
+#ifdef CONFIG_SMP
+ int curr_na_goodness; /* non-affinity goodness */
+ /* value of current task */
+#endif
+ } schedule_data;
+ char __pad [SMP_CACHE_BYTES];
+} aligned_data_t;
+extern aligned_data_t aligned_data[];
+#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+#ifdef CONFIG_SMP
+#define curr_na_goodness(cpu) aligned_data[(cpu)].schedule_data.curr_na_goodness
+#endif
+#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+
+#ifdef CONFIG_SMP
+/*
+ * runqueue_data
+ * One runqueue per CPU in the system, plus one additional runqueue for
+ * realtime tasks. Size should be a multiple of cache line size, and array
+ * of items should start on a cache line boundary.
+ */
+typedef union runqueue_data {
+ struct rq_data {
+ int nt_running; /* # of tasks on runqueue */
+ int max_na_goodness; /* maximum non-affinity */
+ /* goodness value of */
+ /* 'schedulable' task */
+ /* on this runqueue */
+ struct task_struct * max_na_ptr; /* pointer to task which */
+ /* has max_na_goodness */
+ unsigned long max_na_cpus_allowed; /* copy of cpus_allowed */
+ /* field from task with */
+ /* max_na_goodness */
+ struct list_head runqueue; /* list of tasks on runqueue */
+ int running_non_idle; /* flag to indicate this cpu */
+ /* is running something */
+ /* besides the idle thread */
+ spinlock_t runqueue_lock; /* lock for this runqueue */
+ } rq_data;
+ char __pad [SMP_CACHE_BYTES];
+} runqueue_data_t;
+extern runqueue_data_t runqueue_data[];
+#define runqueue_lock(cpu) runqueue_data[(cpu)].rq_data.runqueue_lock
+#define runqueue(cpu) runqueue_data[(cpu)].rq_data.runqueue
+#define max_na_goodness(cpu) runqueue_data[(cpu)].rq_data.max_na_goodness
+#define max_na_cpus_allowed(cpu) \
+ runqueue_data[(cpu)].rq_data.max_na_cpus_allowed
+#define max_na_ptr(cpu) runqueue_data[(cpu)].rq_data.max_na_ptr
+#define nt_running(cpu) runqueue_data[(cpu)].rq_data.nt_running
+#define running_non_idle(cpu) runqueue_data[(cpu)].rq_data.running_non_idle
+
+#define REALTIME_RQ NR_CPUS
+#define NA_GOODNESS_INIT -10000
+#endif
+
+/*
* This serializes "schedule()" and also protects
* the run-queue from deletions/modifications (but
* _adding_ to the beginning of the run-queue has
* a separate lock).
*/
extern rwlock_t tasklist_lock;
+#ifndef CONFIG_SMP
extern spinlock_t runqueue_lock;
+#endif
extern spinlock_t mmlist_lock;
extern void sched_init(void);
@@ -436,6 +510,7 @@
#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
#define MAX_COUNTER (20*HZ/100)
#define DEF_NICE (0)
+#define ALL_CPUS_ALLOWED (-1)
/*
* INIT_TASK is used to set up the first task table, touch at
@@ -454,7 +529,7 @@
policy: SCHED_OTHER, \
mm: NULL, \
active_mm: &init_mm, \
- cpus_allowed: -1, \
+ cpus_allowed: ALL_CPUS_ALLOWED, \
run_list: LIST_HEAD_INIT(tsk.run_list), \
next_task: &tsk, \
prev_task: &tsk, \
@@ -838,18 +913,198 @@
#define next_thread(p) \
list_entry((p)->thread_group.next, struct task_struct, thread_group)
-static inline void del_from_runqueue(struct task_struct * p)
+static inline int task_on_runqueue(struct task_struct *p)
{
- nr_running--;
+ return (p->run_list.next != NULL);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * This function calculates the non-affinity goodness (na_goodness)
+ * of a task. This value is stored in the task struct while a task
+ * is on a runqueue.
+ */
+static inline int na_goodness(struct task_struct * p)
+{
+ int weight;
+
+ /*
+ * select the current process after every other
+ * runnable process, but before the idle thread.
+ * Also, don't trigger a counter recalculation.
+ */
+ weight = -1;
+ if (p->policy & SCHED_YIELD)
+ goto out;
+
+ /*
+ * Be sure to give sufficiently high boost to realtime tasks
+ */
+ if ((p->policy & ~SCHED_YIELD) != SCHED_OTHER) {
+ weight = 1000 + p->rt_priority;
+ goto out;
+ }
+
+ /*
+ * na_goodness is based on the number of ticks left.
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (!weight)
+ goto out;
+
+ /*
+ * Factor in the nice value
+ */
+ weight += 20 - p->nice;
+
+out:
+ return weight;
+}
+
+/*
+ * Determine runqueue associated with task
+ */
+static inline int task_to_runqueue(struct task_struct *t)
+{
+ int rq;
+
+ if ((t->policy & ~SCHED_YIELD) != SCHED_OTHER) {
+ rq = REALTIME_RQ;
+ } else {
+ rq = t->processor;
+ }
+
+ return(rq);
+}
+
+/*
+ * Lock runqueue associated with task
+ */
+static inline void lock_task_rq(struct task_struct *t)
+{
+ int rq = task_to_runqueue(t);
+
+ spin_lock(&runqueue_lock(rq));
+ while (task_to_runqueue(t) != rq) {
+ spin_unlock(&runqueue_lock(rq));
+ rq = task_to_runqueue(t);
+ spin_lock(&runqueue_lock(rq));
+ }
+}
+
+/*
+ * Unlock runqueue associated with task
+ */
+static inline void unlock_task_rq(struct task_struct *t)
+{
+ spin_unlock(&runqueue_lock(task_to_runqueue(t)));
+}
+
+/*
+ * Lock runqueue associated with task and disable interrupts
+ */
+static inline void lock_task_rq_irq(struct task_struct *t)
+{
+ int rq = t->processor;
+
+ spin_lock_irq(&runqueue_lock(rq));
+ while (t->processor != rq) {
+ spin_unlock_irq(&runqueue_lock(rq));
+ rq = t->processor;
+ spin_lock_irq(&runqueue_lock(rq));
+ }
+}
+
+/*
+ * Unlock runqueue associated with task and enable interrupts
+ */
+static inline void unlock_task_rq_irq(struct task_struct *t)
+{
+ spin_unlock_irq(&runqueue_lock(t->processor));
+}
+
+/*
+ * Common routine for both flavors of del_from_runqueue.
+ */
+static inline void del_from_runqueue_common(struct task_struct * p, int upd)
+{
+ int rq = task_to_runqueue(p);
+
+ nt_running(rq)--;
p->sleep_time = jiffies;
list_del(&p->run_list);
p->run_list.next = NULL;
+
+ if (max_na_ptr(rq) == p) {
+ if (upd) {
+ /*
+ * If we want to update max_na_* values for the
+ * runqueue, then we scan the queue and look for
+ * the FIRST schedulable task. This is a 'good
+ * enough' approximation.
+ */
+ struct list_head *tmp;
+ struct task_struct *t, *tmp_task = NULL;
+ int weight, tmp_weight = 0;
+
+ list_for_each(tmp, &runqueue(rq)) {
+ t = list_entry(tmp, struct task_struct,
+ run_list);
+ if (!t->has_cpu) {
+ weight = na_goodness(t);
+ if (weight > tmp_weight) {
+ tmp_weight = weight;
+ tmp_task = t;
+ goto found_one;
+ }
+ }
+ }
+found_one:
+ if (tmp_weight) {
+ max_na_goodness(rq) = tmp_weight;
+ max_na_cpus_allowed(rq) =
+ tmp_task->cpus_allowed;
+ max_na_ptr(rq) = tmp_task;
+ } else {
+ max_na_goodness(rq) = NA_GOODNESS_INIT;
+ max_na_ptr(rq) = NULL;
+ }
+ } else {
+ max_na_goodness(rq) = NA_GOODNESS_INIT;
+ max_na_ptr(rq) = NULL;
+ }
+ }
}
-static inline int task_on_runqueue(struct task_struct *p)
+/*
+ * del_from_runqueue without updating max_na_* values. Used in
+ * places where we know we will be updating these values before
+ * dropping the runqueue lock.
+ */
+static inline void del_from_runqueue(struct task_struct * p)
{
- return (p->run_list.next != NULL);
+ del_from_runqueue_common(p, 0);
}
+
+/*
+ * del_from_runqueue_update will update the max_na_* values
+ * if necessary.
+ */
+static inline void del_from_runqueue_update(struct task_struct * p)
+{
+ del_from_runqueue_common(p, 1);
+}
+#else
+static inline void del_from_runqueue(struct task_struct * p)
+{
+ nr_running--;
+ p->sleep_time = jiffies;
+ list_del(&p->run_list);
+ p->run_list.next = NULL;
+}
+#endif
static inline void unhash_process(struct task_struct *p)
{
diff -Naur linux-2.4.7/kernel/fork.c linux-2.4.7-mq/kernel/fork.c
--- linux-2.4.7/kernel/fork.c Wed Jul 18 01:23:28 2001
+++ linux-2.4.7-mq/kernel/fork.c Mon Aug 6 15:59:32 2001
@@ -27,7 +27,9 @@
/* The idle threads do not count.. */
int nr_threads;
+#ifndef CONFIG_SMP
int nr_running;
+#endif
int max_threads;
unsigned long total_forks; /* Handle normal Linux uptimes. */
diff -Naur linux-2.4.7/kernel/sched.c linux-2.4.7-mq/kernel/sched.c
--- linux-2.4.7/kernel/sched.c Wed Jul 18 01:30:50 2001
+++ linux-2.4.7-mq/kernel/sched.c Mon Aug 6 17:10:18 2001
@@ -78,33 +78,42 @@
/*
* The tasklist_lock protects the linked list of processes.
*
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
- *
* If both locks are to be concurrently held, the runqueue_lock
* nests inside the tasklist_lock.
*
* task->alloc_lock nests inside tasklist_lock.
*/
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
+#ifdef CONFIG_SMP
+/*
+ * There are NR_CPUS + 1 run queues. In addition to CPU specific
+ * runqueues there is a realtime runqueue so that any CPU can easily
+ * schedule realtime tasks. All CPUs can schedule tasks from the
+ * realtime runqueue (with appropriate locking of course).
+ * Initialization is performed in sched_init().
+ */
+runqueue_data_t runqueue_data [NR_CPUS+1] __cacheline_aligned;
+#else
+/*
+ * The run-queue lock locks the parts that actually access
+ * and change the run-queues, and have to be interrupt-safe.
+ */
+spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
static LIST_HEAD(runqueue_head);
+#endif
/*
* We align per-CPU scheduling data on cacheline boundaries,
* to prevent cacheline ping-pong.
*/
-static union {
- struct schedule_data {
- struct task_struct * curr;
- cycles_t last_schedule;
- } schedule_data;
- char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
-
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#ifdef CONFIG_SMP
+aligned_data_t aligned_data [NR_CPUS] __cacheline_aligned =
+ { {{&init_task,NA_GOODNESS_INIT,0}}};
+#else
+aligned_data_t aligned_data [NR_CPUS] __cacheline_aligned =
+ { {{&init_task,0}}};
+#endif
struct kernel_stat kstat;
@@ -113,6 +122,8 @@
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) ((!(p)->has_cpu) && \
((p)->cpus_allowed & (1 << cpu)))
+#define local_can_schedule(p) (!(p)->has_cpu)
+#define this_cpu_allowed(ca, tcpu) ((ca) & (1 << tcpu))
#else
@@ -123,6 +134,50 @@
void scheduling_functions_start_here(void) { }
+#ifdef CONFIG_SMP
+/*
+ * Sum CPU specific nt_running fields to determine how many
+ * runnable tasks there are in the system. Replacement for
+ * the global nr_running variable.
+ */
+int
+nr_running(void) {
+ int i;
+ int tot=nt_running(REALTIME_RQ);
+
+ for(i=0; i<smp_num_cpus; i++) {
+ tot += nt_running(cpu_logical_map(i));
+ }
+
+ return(tot);
+}
+
+/*
+ * A stripped down version of the goodness routine for use on a CPU
+ * specific runqueue. This routine does not need to be concerned
+ * with realtime tasks, and does not need to take CPU affinity into
+ * account.
+ */
+static inline int local_goodness(struct task_struct * p, struct mm_struct *this_mm)
+{
+ int weight;
+
+ weight = -1;
+ if (p->policy & SCHED_YIELD)
+ goto local_out;
+
+ weight = p->counter;
+ if (!weight)
+ goto local_out;
+
+ if (p->mm == this_mm || !p->mm)
+ weight += 1;
+ weight += 20 - p->nice;
+local_out:
+ return weight;
+}
+#endif
+
/*
* This is the function that decides how desirable a process is..
* You can weigh different processes against each other depending
@@ -199,92 +254,213 @@
}
/*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
+ * Careful!
+ *
+ * This has to add the process to the _beginning_ of the
+ * run-queue, not the end. See the comment about "This is
+ * subtle" in the scheduler proper..
+ */
+static inline void add_to_runqueue(struct task_struct * p)
+{
+#ifdef CONFIG_SMP
+ int rq = task_to_runqueue(p);
+ int tsk_na_goodness = na_goodness(p);
+
+ list_add(&p->run_list, &runqueue(rq));
+
+ if (!p->has_cpu && (tsk_na_goodness > max_na_goodness(rq))) {
+ max_na_goodness(rq) = tsk_na_goodness;
+ max_na_cpus_allowed(rq) = p->cpus_allowed;
+ max_na_ptr(rq) = p;
+ }
+
+ nt_running(rq)++;
+#else
+ list_add(&p->run_list, &runqueue_head);
+ nr_running++;
+#endif
+}
+
+/*
+ * The runqueue lock must be held upon entry to this routine.
*/
static FASTCALL(void reschedule_idle(struct task_struct * p));
static void reschedule_idle(struct task_struct * p)
{
#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
- struct task_struct *tsk, *target_tsk;
- int cpu, best_cpu, i, max_prio;
- cycles_t oldest_idle;
+ struct task_struct *tsk;
+ int target_cpu, this_cpu, tsk_cpu;
+ int i, cpu;
+ int need_resched;
+ cycles_t curr_cycles, tmp_cycles;
+ int stack_list[NR_CPUS];
+ int saved_na_goodness, tmp_min_na_goodness;
+
+ tsk_cpu = p->processor;
+ this_cpu = smp_processor_id();
+ /*
+ * First check if the task's previous CPU is idle, use it if it is.
+ */
+ if (can_schedule(p, tsk_cpu) &&
+ (cpu_curr(tsk_cpu) == idle_task(tsk_cpu))) {
+ if (!task_on_runqueue(p)) {
+ add_to_runqueue(p);
+ }
+ tsk = cpu_curr(tsk_cpu);
+ need_resched = tsk->need_resched;
+ tsk->need_resched = 1;
+ if ((tsk_cpu != this_cpu) && !need_resched) {
+ smp_send_reschedule(tsk_cpu);
+ }
+ return;
+ }
/*
- * shortcut if the woken up task's last CPU is
- * idle now.
- */
- best_cpu = p->processor;
- if (can_schedule(p, best_cpu)) {
- tsk = idle_task(best_cpu);
- if (cpu_curr(best_cpu) == tsk) {
- int need_resched;
-send_now_idle:
+ * Create a list of current na_goodness values on our stack.
+ * Only values less than the non-affinity goodness value of
+ * p should be considered for preemption.
+ */
+ saved_na_goodness = na_goodness(p) - 1; /* preemption_goodness() > 1 */
+ tmp_min_na_goodness = saved_na_goodness;
+ curr_cycles = get_cycles();
+ target_cpu = -1;
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpu = cpu_logical_map(i);
+
+ if (!can_schedule(p, cpu)) {
+ stack_list[cpu] = saved_na_goodness;
+ continue;
+ }
+
+ if (curr_na_goodness(cpu) == NA_GOODNESS_INIT) {
/*
- * If need_resched == -1 then we can skip sending
- * the IPI altogether, tsk->need_resched is
- * actively watched by the idle thread.
+ * Indicates an idle task. For idle tasks, determine
+ * the amount of time they have been idle. Use the
+ * negative of this value in the list. Hence, we
+ * first choose the CPU that has been idle the longest.
*/
- need_resched = tsk->need_resched;
- tsk->need_resched = 1;
- if ((best_cpu != this_cpu) && !need_resched)
- smp_send_reschedule(best_cpu);
- return;
+ tmp_cycles = curr_cycles - last_schedule(cpu);
+ if (tmp_cycles > INT_MAX) {
+ stack_list[cpu] = INT_MIN;
+ } else {
+ stack_list[cpu] = (int)-tmp_cycles;
+ }
+ } else {
+ stack_list[cpu] = curr_na_goodness(cpu);
+ /*
+ * Add in PROC_CHANGE_PENALTY for remote CPUs
+ */
+ if (cpu != tsk_cpu) {
+ stack_list[cpu] += PROC_CHANGE_PENALTY;
+ }
+ }
+
+ /*
+ * Look for the lowest value
+ */
+ if (stack_list[cpu] < tmp_min_na_goodness) {
+ target_cpu = cpu;
+ tmp_min_na_goodness = stack_list[cpu];
}
}
/*
- * We know that the preferred CPU has a cache-affine current
- * process, lets try to find a new idle CPU for the woken-up
- * process. Select the least recently active idle CPU. (that
- * one will have the least active cache context.) Also find
- * the executing process which has the least priority.
- */
- oldest_idle = (cycles_t) -1;
- target_tsk = NULL;
- max_prio = 1;
+ * We try to add the task to a runqueue starting with the one
+ * that has the lowest na_goodness value.
+ */
+ while (target_cpu != -1) {
+ if (target_cpu == tsk_cpu &&
+ preemption_goodness((tsk = cpu_curr(target_cpu)),
+ p, target_cpu) > 1) {
+ /*
+ * If target_cpu is tsk_cpu, then no additional
+ * locking is required (we already have the CPU
+ * specific runqueue locked). We also know that
+ * this CPU can not be idle, otherwise the fast
+ * path at the beginning of this routine would
+ * have been executed. Therefore, simply send
+ * the IPI if required.
+ */
+ if (!task_on_runqueue(p)) {
+ add_to_runqueue(p);
+ }
+ tsk = cpu_curr(target_cpu);
+ tsk->need_resched = 1;
+ if (target_cpu != this_cpu) {
+ smp_send_reschedule(target_cpu);
+ }
+ return;
+ }
- for (i = 0; i < smp_num_cpus; i++) {
- cpu = cpu_logical_map(i);
- if (!can_schedule(p, cpu))
- continue;
- tsk = cpu_curr(cpu);
/*
- * We use the first available idle CPU. This creates
- * a priority list between idle CPUs, but this is not
- * a problem.
+ * Try to lock runqueue and verify na_goodness value.
*/
- if (tsk == idle_task(cpu)) {
- if (last_schedule(cpu) < oldest_idle) {
- oldest_idle = last_schedule(cpu);
- target_tsk = tsk;
- }
- } else {
- if (oldest_idle == -1ULL) {
- int prio = preemption_goodness(tsk, p, cpu);
+ else if (spin_trylock(&runqueue_lock(target_cpu))) {
+ tsk = cpu_curr(target_cpu);
+ if ((tsk == idle_task(target_cpu)) ||
+ (preemption_goodness(tsk, p, target_cpu) > 1)) {
+ /*
+ * Target CPU is idle, or it is running
+ * a task with lower priority than p.
+ * Therefore, move p to target runqueue.
+ */
+ if (task_on_runqueue(p)) {
+ del_from_runqueue_update(p);
+ }
+ p->processor = target_cpu;
+ add_to_runqueue(p);
- if (prio > max_prio) {
- max_prio = prio;
- target_tsk = tsk;
+ /*
+ * Send an IPI to target CPU, unless the
+ * CPU is idle and the need_resched flag
+ * has already been set.
+ */
+ need_resched = tsk->need_resched;
+ tsk->need_resched = 1;
+ if ((target_cpu != this_cpu) &&
+ ((tsk != idle_task(target_cpu)) ||
+ !need_resched)){
+ smp_send_reschedule(target_cpu);
}
+
+ spin_unlock(&runqueue_lock(target_cpu));
+
+ return;
}
+ spin_unlock(&runqueue_lock(target_cpu));
}
- }
- tsk = target_tsk;
- if (tsk) {
- if (oldest_idle != -1ULL) {
- best_cpu = tsk->processor;
- goto send_now_idle;
+
+ /*
+ * Update list value so we don't check this CPU again.
+ */
+ stack_list[target_cpu] = saved_na_goodness;
+
+ /*
+ * Find the 'next lowest' curr_na_goodness value.
+ */
+ target_cpu = -1;
+ tmp_min_na_goodness = saved_na_goodness;
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpu = cpu_logical_map(i);
+ if (stack_list[cpu] < tmp_min_na_goodness) {
+ target_cpu = cpu;
+ tmp_min_na_goodness = stack_list[cpu];
+ }
}
- tsk->need_resched = 1;
- if (tsk->processor != this_cpu)
- smp_send_reschedule(tsk->processor);
+ }
+
+ /*
+ * If we get here, it means that the best place for the task is
+ * on its currently assigned runqueue. Also, we know that the
+ * task currently running on this task's runqueue has sufficient
+ * priority to prevent preemption. Hence, we simply ensure the
+ * task is on the runqueue.
+ */
+ if (!task_on_runqueue(p)) {
+ add_to_runqueue(p);
}
return;
-
#else /* UP */
int this_cpu = smp_processor_id();
@@ -296,29 +472,46 @@
#endif
}
+#ifdef CONFIG_SMP
/*
- * Careful!
- *
- * This has to add the process to the _beginning_ of the
- * run-queue, not the end. See the comment about "This is
- * subtle" in the scheduler proper..
+ * Lock runqueue associated with task's CPU and save irq state
+ * NOTE that this is different than locking the runqueue associated
+ * with the task (specifically for realtime tasks).
*/
-static inline void add_to_runqueue(struct task_struct * p)
+static inline int lock_task_cpu_rq_irqsave(struct task_struct *t,
+ unsigned long *flags)
{
- list_add(&p->run_list, &runqueue_head);
- nr_running++;
+ int rq = t->processor;
+
+ spin_lock_irqsave(&runqueue_lock(rq), *flags);
+ while (t->processor != rq) {
+ spin_unlock_irqrestore(&runqueue_lock(rq), *flags);
+ rq = t->processor;
+ spin_lock_irqsave(&runqueue_lock(rq), *flags);
+ }
+
+ return(rq);
}
+#endif
static inline void move_last_runqueue(struct task_struct * p)
{
list_del(&p->run_list);
+#ifdef CONFIG_SMP
+ list_add_tail(&p->run_list, &runqueue(task_to_runqueue(p)));
+#else
list_add_tail(&p->run_list, &runqueue_head);
+#endif
}
static inline void move_first_runqueue(struct task_struct * p)
{
list_del(&p->run_list);
+#ifdef CONFIG_SMP
+ list_add(&p->run_list, &runqueue(task_to_runqueue(p)));
+#else
list_add(&p->run_list, &runqueue_head);
+#endif
}
/*
@@ -333,20 +526,47 @@
{
unsigned long flags;
int success = 0;
+#ifdef CONFIG_SMP
+ int rq;
+
+ rq = lock_task_cpu_rq_irqsave(p, &flags);
+ if (task_to_runqueue(p) == REALTIME_RQ) {
+ /*
+ * Indicates p is a realtime task. We must
+ * also lock the realtime runqueue.
+ */
+ spin_lock(&runqueue_lock(REALTIME_RQ));
+ }
+#else
+ spin_lock_irqsave(&runqueue_lock, flags);
+#endif
/*
* We want the common case fall through straight, thus the goto.
*/
- spin_lock_irqsave(&runqueue_lock, flags);
p->state = TASK_RUNNING;
if (task_on_runqueue(p))
goto out;
+#ifndef CONFIG_SMP
add_to_runqueue(p);
if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
reschedule_idle(p);
+#else
+ if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
+ reschedule_idle(p);
+ else
+ add_to_runqueue(p);
+#endif
success = 1;
out:
+#ifdef CONFIG_SMP
+ if (task_to_runqueue(p) == REALTIME_RQ) {
+ spin_unlock(&runqueue_lock(REALTIME_RQ));
+ }
+ spin_unlock_irqrestore(&runqueue_lock(rq), flags);
+#else
spin_unlock_irqrestore(&runqueue_lock, flags);
+#endif
return success;
}
@@ -449,6 +669,7 @@
{
#ifdef CONFIG_SMP
int policy;
+ int rq;
/*
* prev->policy can be written from here only before `prev'
@@ -501,15 +722,27 @@
(policy & SCHED_YIELD))
goto out_unlock;
- spin_lock_irqsave(&runqueue_lock, flags);
+ rq = lock_task_cpu_rq_irqsave(prev, &flags);
+ if (task_to_runqueue(prev) == REALTIME_RQ) {
+ /*
+ * Indicates prev is a realtime task. We must
+ * also lock the realtime runqueue.
+ */
+ spin_lock(&runqueue_lock(REALTIME_RQ));
+ }
+
if ((prev->state == TASK_RUNNING) && !prev->has_cpu)
reschedule_idle(prev);
- spin_unlock_irqrestore(&runqueue_lock, flags);
+
+ if (task_to_runqueue(prev) == REALTIME_RQ) {
+ spin_unlock(&runqueue_lock(REALTIME_RQ));
+ }
+ spin_unlock_irqrestore(&runqueue_lock(rq), flags);
goto out_unlock;
}
#else
prev->policy &= ~SCHED_YIELD;
-#endif /* CONFIG_SMP */
+#endif
}
void schedule_tail(struct task_struct *prev)
@@ -518,10 +751,7 @@
}
/*
- * 'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
- *
- * The goto is "interesting".
+ * 'schedule()' is the scheduler function.
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
@@ -529,28 +759,42 @@
*/
asmlinkage void schedule(void)
{
- struct schedule_data * sched_data;
struct task_struct *prev, *next, *p;
struct list_head *tmp;
int this_cpu, c;
+#ifdef CONFIG_SMP
+ struct task_struct *prev_next;
+ int tmp_na_goodness, prev_next_weight;
+ int prev_rq, rrq, rq, rcpu;
+ int stack_list[NR_CPUS];
+ int premature_idle;
+#endif
if (!current->active_mm) BUG();
need_resched_back:
prev = current;
this_cpu = prev->processor;
+#ifdef CONFIG_SMP
+ prev_rq = task_to_runqueue(prev);
+#endif
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
- /*
- * 'sched_data' is protected by the fact that we can run
- * only one process per CPU.
- */
- sched_data = & aligned_data[this_cpu].schedule_data;
-
+#ifdef CONFIG_SMP
+ spin_lock_irq(&runqueue_lock(this_cpu));
+ if (prev_rq != this_cpu) {
+ /*
+ * Indicates current is a realtime task. We must also
+ * lock the realtime runqueue.
+ */
+ spin_lock(&runqueue_lock(REALTIME_RQ));
+ }
+#else
spin_lock_irq(&runqueue_lock);
+#endif
/* move an exhausted RR process to be last.. */
if (prev->policy == SCHED_RR)
@@ -579,10 +823,15 @@
*/
next = idle_task(this_cpu);
c = -1000;
+#ifdef CONFIG_SMP
+ prev_next = next;
+ prev_next_weight = c;
+#endif
if (prev->state == TASK_RUNNING)
goto still_running;
still_running_back:
+#ifndef CONFIG_SMP
list_for_each(tmp, &runqueue_head) {
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu)) {
@@ -591,21 +840,212 @@
c = weight, next = p;
}
}
+#else
+ /*
+ * There exists a separate realtime runqueue for all realtime
+ * tasks. Any CPU can schedule a task from the realtime runqueue.
+ *
+ * We scan the realtime runqueue if either,
+ * - The current task is a realtime task. Hence, the realtime
+ * run-queue is already locked.
+ * - There is a task on the realtime runqueue. In this case we
+ * must acquire the realtime runqueue lock here.
+ * Otherwise, we skip the scan of the realtime runqueue.
+ */
+ if (prev_rq == REALTIME_RQ || nt_running(REALTIME_RQ)) {
+ goto scan_rt_queue;
+ }
+scan_rt_queue_back:
+
+ /*
+ * Search CPU specific runqueue
+ */
+ list_for_each(tmp, &runqueue(this_cpu)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (local_can_schedule(p)) {
+ int weight = local_goodness(p, prev->active_mm);
+ if (weight > c) {
+ if (!next->has_cpu) {
+ /*
+ * prev_next must point to a
+ * schedulable task.
+ */
+ prev_next_weight = c;
+ prev_next = next;
+ }
+ c = weight;
+ next = p;
+ } else if (weight > prev_next_weight) {
+ prev_next_weight = weight;
+ prev_next = p;
+ }
+ }
+ }
+
+ /*
+ * If next task is realtime, there is no need to look at other
+ * runqueues. Simply set max_na_goodness values appropriately.
+ */
+ if (task_to_runqueue(next) == REALTIME_RQ) {
+ goto set_max_na_goodness;
+ }
+
+ /*
+ * As calculated above, c does not contain a CPU affinity boost.
+ * We must add this boost before comparing to tasks on other
+ * runqueues. Only add PROC_CHANGE_PENALTY if c is a positive
+ * goodness value.
+ */
+ if (c > 0) {
+ c += PROC_CHANGE_PENALTY;
+ }
+
+ /*
+ * Copy max_na_goodness values from CPU specific runqueue
+ * structures to the list on our stack.
+ */
+scan_rmt_queues:
+ premature_idle = 0;
+ rrq = -1;
+ tmp_na_goodness = c;
+ for (rq = 0; rq < smp_num_cpus; rq++) {
+ rcpu = cpu_logical_map(rq);
+ if (rcpu == this_cpu) {
+ stack_list[rcpu] = NA_GOODNESS_INIT;
+ continue;
+ }
+ if (!this_cpu_allowed(max_na_cpus_allowed(rcpu), this_cpu)) {
+ stack_list[rcpu] = NA_GOODNESS_INIT;
+ continue;
+ }
+ if (max_na_goodness(rcpu) <= c) {
+ stack_list[rcpu] = NA_GOODNESS_INIT;
+ continue;
+ }
+
+ stack_list[rcpu] = max_na_goodness(rcpu);
+ if (stack_list[rcpu] > tmp_na_goodness) {
+ rrq = rcpu;
+ tmp_na_goodness = stack_list[rcpu];
+ }
+ }
+
+ /*
+ * Now use the values from the stack list to search for a
+ * task to steal.
+ */
+ while (rrq != -1) {
+ /*
+ * First try to lock the remote runqueue and verify
+ * the max_na_goodness value.
+ */
+ if (spin_trylock(&runqueue_lock(rrq))) {
+ if (max_na_goodness(rrq) > c &&
+ this_cpu_allowed(max_na_cpus_allowed(rrq),
+ this_cpu)) {
+ /*
+ * Move a remote task to our runqueue
+ */
+ if (!next->has_cpu) {
+ prev_next = next;
+ }
+ next = max_na_ptr(rrq);
+ c = max_na_goodness(rrq);
+ del_from_runqueue_update(next);
+ next->processor = this_cpu;
+ add_to_runqueue(next);
+
+ /*
+ * We have stolen a task from another
+ * runqueue, quit looking.
+ */
+ spin_unlock(&runqueue_lock(rrq));
+ break;
+ }
+ spin_unlock(&runqueue_lock(rrq));
+ } else {
+ premature_idle++;
+ }
+
+ /*
+ * We were either unable to get the remote runqueue lock,
+ * or the remote runqueue has changed such that it is no
+ * longer desirable to steal a task from the queue.
+ *
+ * Go to the runqueue with the 'next highest' max_na_goodness
+ * value.
+ */
+ stack_list[rrq] = NA_GOODNESS_INIT;
+ tmp_na_goodness = c;
+ rrq = -1;
+ for (rq = 0; rq < smp_num_cpus; rq++) {
+ rcpu = cpu_logical_map(rq);
+ if (stack_list[rcpu] > tmp_na_goodness) {
+ rrq = rcpu;
+ tmp_na_goodness = stack_list[rcpu];
+ }
+ }
+ }
+
+ /*
+ * Check for going idle prematurely. If this is the case, there
+ * is a good chance there are schedulable tasks on other runqueues.
+ */
+ if ((next == idle_task(this_cpu)) && premature_idle) {
+ /*
+ * Be sure to clear max_na_goodness, otherwise there
+ * is the potential for deadlock.
+ */
+ max_na_goodness(this_cpu) = NA_GOODNESS_INIT;
+ goto scan_rmt_queues;
+ }
+#endif
/* Do we need to re-calculate counters? */
if (!c)
goto recalculate;
+
+#ifdef CONFIG_SMP
+set_max_na_goodness:
+ /*
+ * Make sure max_na_* values for runqueue are updated
+ */
+ if (prev_next != idle_task(this_cpu)) {
+ max_na_goodness(this_cpu) = na_goodness(prev_next);
+ max_na_cpus_allowed(this_cpu) = prev_next->cpus_allowed;
+ max_na_ptr(this_cpu) = prev_next;
+ } else {
+ max_na_goodness(this_cpu) = NA_GOODNESS_INIT;
+ /* max_na_cpus_allowed need not be set */
+ max_na_ptr(this_cpu) = NULL;
+ }
+
+ /*
+ * Update scheduling fields in next task structure.
+ */
+ next->has_cpu = 1;
+found_next:
+ next->processor = this_cpu;
+
+#endif
/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
* sched_data.
*/
- sched_data->curr = next;
+ cpu_curr(this_cpu) = next;
#ifdef CONFIG_SMP
- next->has_cpu = 1;
- next->processor = this_cpu;
-#endif
+ if (next != idle_task(this_cpu)) {
+ curr_na_goodness(this_cpu) = na_goodness(next);
+ running_non_idle(this_cpu) = 1;
+ } else {
+ curr_na_goodness(this_cpu) = NA_GOODNESS_INIT;
+ running_non_idle(this_cpu) = 0;
+ }
+ spin_unlock_irq(&runqueue_lock(this_cpu));
+#else
spin_unlock_irq(&runqueue_lock);
+#endif
if (prev == next) {
/* We won't go through the normal tail, so do this by hand */
@@ -621,7 +1061,7 @@
* and it's approximate, so we do not have to maintain
* it while holding the runqueue spinlock.
*/
- sched_data->last_schedule = get_cycles();
+ last_schedule(this_cpu) = get_cycles();
/*
* We drop the scheduler lock early (it's a global spinlock),
@@ -629,7 +1069,7 @@
* rescheduled during switch_to().
*/
-#endif /* CONFIG_SMP */
+#endif
kstat.context_swtch++;
/*
@@ -678,18 +1118,42 @@
recalculate:
{
struct task_struct *p;
+#ifdef CONFIG_SMP
+ spin_unlock_irq(&runqueue_lock(this_cpu));
+#else
spin_unlock_irq(&runqueue_lock);
+#endif
read_lock(&tasklist_lock);
for_each_task(p)
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
read_unlock(&tasklist_lock);
+#ifdef CONFIG_SMP
+ spin_lock_irq(&runqueue_lock(this_cpu));
+#else
spin_lock_irq(&runqueue_lock);
+#endif
}
goto repeat_schedule;
still_running:
- if (!(prev->cpus_allowed & (1UL << this_cpu)))
+#ifdef CONFIG_SMP
+ if (!(prev->cpus_allowed & (1UL << this_cpu))) {
+ unsigned long allowed = prev->cpus_allowed;
+ /*
+ * task is no longer runnable on this CPU. We remove it
+ * from the runqueue to ensure that it will not be considered
+ * for scheduling here. Let schedule_tail take care of
+ * sending it off to an appropriate CPU.
+ */
+ del_from_runqueue(prev);
+ prev->processor = 0;
+ while (!(allowed & 1UL)) {
+ prev->processor++;
+ allowed = allowed >> 1;
+ }
goto still_running_back;
+ }
+#endif
c = goodness(prev, this_cpu, prev->active_mm);
next = prev;
goto still_running_back;
@@ -701,6 +1165,54 @@
}
goto move_rr_back;
+#ifdef CONFIG_SMP
+scan_rt_queue:
+ if (prev_rq != REALTIME_RQ) {
+ spin_lock(&runqueue_lock(REALTIME_RQ));
+ }
+ /*
+ * Scan the queue. Note that there is no need to
+ * keep track of max_na_goodness values for the
+ * realtime runqueue, as they are never used in
+ * scheduling decisions.
+ */
+ list_for_each(tmp, &runqueue(REALTIME_RQ)) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu)) {
+ int weight = 1000 + p->rt_priority;
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+
+ /*
+ * Unlock the realtime runqueue before continuing. If we
+ * selected a realtime task to execute, be sure to make it
+ * non-schedulable (set has_cpu) before releasing the lock.
+ * Note that we still hold the CPU specific runqueue lock.
+ */
+ if (next != idle_task(this_cpu)) {
+ next->has_cpu = 1;
+ }
+ spin_unlock(&runqueue_lock(REALTIME_RQ));
+
+ /*
+ * If we find a realtime task to schedule we are mostly done.
+ * However, the max_na_goodness value of this CPU's runqueue
+ * may need to be set to an appropriate value (so that other
+ * CPUs can steal tasks while this CPU runs the realtime task).
+ * If necessary, make a pass through the CPU specific runqueue
+ * to set the value.
+ */
+ if ((task_to_runqueue(next) == REALTIME_RQ) &&
+ ((max_na_goodness(this_cpu) != NA_GOODNESS_INIT) ||
+ !nt_running(this_cpu)) ) {
+ goto found_next;
+ }
+
+ goto scan_rt_queue_back;
+#endif
+
scheduling_in_interrupt:
printk("Scheduling in interrupt\n");
BUG();
@@ -909,6 +1421,8 @@
struct sched_param lp;
struct task_struct *p;
int retval;
+ int lock_rt = 0;
+ int was_on_rq = 0;
retval = -EINVAL;
if (!param || pid < 0)
@@ -918,18 +1432,31 @@
if (copy_from_user(&lp, param, sizeof(struct sched_param)))
goto out_nounlock;
+ retval = -ESRCH;
+#ifdef CONFIG_SMP
+ /*
+ * Note that here we get the tasklist lock so that we can
+ * find the task struct. Not until we have access to the
+ * task struct can we determine what runqueue to lock.
+ */
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ goto out_nounlock;
+ }
+ lock_task_rq_irq(p);
+#else
/*
* We play safe to avoid deadlocks.
*/
read_lock_irq(&tasklist_lock);
spin_lock(&runqueue_lock);
-
p = find_process_by_pid(pid);
-
- retval = -ESRCH;
if (!p)
goto out_unlock;
-
+#endif
+
if (policy < 0)
policy = p->policy;
else {
@@ -958,16 +1485,49 @@
goto out_unlock;
retval = 0;
+
+#ifdef CONFIG_SMP
+ if ((p->policy != SCHED_OTHER) || (policy != SCHED_OTHER)) {
+ /*
+ * If changing to/from a realtime policy, OR simply
+ * changing the priority of a realtime task, we must
+ * lock the realtime runqueue.
+ */
+ spin_lock(&runqueue_lock(REALTIME_RQ));
+ lock_rt = 1;
+ }
+
+ if (task_on_runqueue(p)) {
+ del_from_runqueue_update(p);
+ was_on_rq = 1;
+ }
+ p->policy = policy;
+ p->rt_priority = lp.sched_priority;
+ if (was_on_rq) {
+ add_to_runqueue(p);
+ }
+ current->need_resched = 1;
+
+ if (lock_rt) {
+ spin_unlock(&runqueue_lock(REALTIME_RQ));
+ }
+#else
p->policy = policy;
p->rt_priority = lp.sched_priority;
if (task_on_runqueue(p))
move_first_runqueue(p);
current->need_resched = 1;
+#endif
out_unlock:
+#ifdef CONFIG_SMP
+ unlock_task_rq_irq(p);
+ read_unlock(&tasklist_lock);
+#else
spin_unlock(&runqueue_lock);
read_unlock_irq(&tasklist_lock);
+#endif
out_nounlock:
return retval;
@@ -1044,19 +1604,18 @@
* to be atomic.) In threaded applications this optimization
* gets triggered quite often.
*/
-
- int nr_pending = nr_running;
-
#if CONFIG_SMP
+ int nr_pending = nt_running(REALTIME_RQ);
int i;
// Substract non-idle processes running on other CPUs.
for (i = 0; i < smp_num_cpus; i++) {
int cpu = cpu_logical_map(i);
- if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
- nr_pending--;
+ nr_pending += (nt_running(cpu) - running_non_idle(cpu));
}
#else
+ int nr_pending = nr_running;
+
// on UP this process is on the runqueue as well
nr_pending--;
#endif
@@ -1270,6 +1829,21 @@
*/
int cpu = smp_processor_id();
int nr;
+
+#ifdef CONFIG_SMP
+ /*
+ * Initialize the scheduling data structures
+ */
+ for (nr = 0; nr < NR_CPUS+1; nr++) {
+ runqueue_lock(nr) = SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&runqueue(nr));
+ max_na_goodness(nr) = NA_GOODNESS_INIT;
+ /* max_na_cpus_allowed need not be initialized */
+ max_na_ptr(nr) = NULL;
+ nt_running(nr) = 0;
+ running_non_idle(nr) = 0;
+ }
+#endif
init_task.processor = cpu;
diff -Naur linux-2.4.7/kernel/signal.c linux-2.4.7-mq/kernel/signal.c
--- linux-2.4.7/kernel/signal.c Thu Jan 4 04:45:26 2001
+++ linux-2.4.7-mq/kernel/signal.c Mon Aug 6 15:59:32 2001
@@ -483,10 +483,10 @@
* process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt.
*/
- spin_lock(&runqueue_lock);
+ lock_task_rq(t);
if (t->has_cpu && t->processor != smp_processor_id())
smp_send_reschedule(t->processor);
- spin_unlock(&runqueue_lock);
+ unlock_task_rq(t);
#endif /* CONFIG_SMP */
}
diff -Naur linux-2.4.7/kernel/timer.c linux-2.4.7-mq/kernel/timer.c
--- linux-2.4.7/kernel/timer.c Tue Jun 12 23:40:11 2001
+++ linux-2.4.7-mq/kernel/timer.c Mon Aug 6 15:59:32 2001
@@ -587,6 +587,11 @@
p->counter = 0;
p->need_resched = 1;
}
+#ifdef CONFIG_SMP
+ if (curr_na_goodness(cpu) != NA_GOODNESS_INIT) {
+ curr_na_goodness(cpu) = na_goodness(p);
+ }
+#endif
if (p->nice > 0)
kstat.per_cpu_nice[cpu] += user_tick;
else