diff -Naur linux-2.4.14/arch/i386/kernel/apm.c linux-2.4.14-mq/arch/i386/kernel/apm.c
--- linux-2.4.14/arch/i386/kernel/apm.c	Fri Oct 19 15:32:28 2001
+++ linux-2.4.14-mq/arch/i386/kernel/apm.c	Tue Nov  6 22:19:27 2001
@@ -1342,7 +1342,7 @@
  * decide if we should just power down.
  *
  */
-#define system_idle() (nr_running == 1)
+#define system_idle() (nr_running() == 1)
 
 static void apm_mainloop(void)
 {
diff -Naur linux-2.4.14/arch/i386/kernel/smpboot.c linux-2.4.14-mq/arch/i386/kernel/smpboot.c
--- linux-2.4.14/arch/i386/kernel/smpboot.c	Fri Oct  5 01:42:54 2001
+++ linux-2.4.14-mq/arch/i386/kernel/smpboot.c	Tue Nov  6 22:19:27 2001
@@ -799,14 +799,19 @@
 	if (!idle)
 		panic("No idle process for CPU %d", cpu);
 
-	idle->processor = cpu;
 
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->has_cpu = 1; /* we schedule the first task manually */
 	idle->thread.eip = (unsigned long) start_secondary;
 
 	del_from_runqueue(idle);
+	/*
+	 * Don't change processor/runqueue of task while it is
+	 * still on runqueue.
+	 */
+	idle->processor = cpu;
+	idle->has_cpu = 1; /* we schedule the first task manually */
+
 	unhash_process(idle);
 	init_tasks[cpu] = idle;
 
diff -Naur linux-2.4.14/fs/proc/proc_misc.c linux-2.4.14-mq/fs/proc/proc_misc.c
--- linux-2.4.14/fs/proc/proc_misc.c	Thu Oct 11 17:46:57 2001
+++ linux-2.4.14-mq/fs/proc/proc_misc.c	Tue Nov  6 22:19:27 2001
@@ -96,7 +96,7 @@
 		LOAD_INT(a), LOAD_FRAC(a),
 		LOAD_INT(b), LOAD_FRAC(b),
 		LOAD_INT(c), LOAD_FRAC(c),
-		nr_running, nr_threads, last_pid);
+		nr_running(), nr_threads, last_pid);
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
diff -Naur linux-2.4.14/include/linux/sched.h linux-2.4.14-mq/include/linux/sched.h
--- linux-2.4.14/include/linux/sched.h	Mon Nov  5 20:42:14 2001
+++ linux-2.4.14-mq/include/linux/sched.h	Tue Nov  6 22:46:17 2001
@@ -72,7 +72,7 @@
 #define CT_TO_SECS(x)	((x) / HZ)
 #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
 
-extern int nr_running, nr_threads;
+extern int nr_threads;
 extern int last_pid;
 
 #include <linux/fs.h>
@@ -132,14 +132,8 @@
 
 #include <linux/spinlock.h>
 
-/*
- * This serializes "schedule()" and also protects
- * the run-queue from deletions/modifications (but
- * _adding_ to the beginning of the run-queue has
- * a separate lock).
- */
+
 extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
 extern spinlock_t mmlist_lock;
 
 extern void sched_init(void);
@@ -437,6 +431,7 @@
 #define DEF_COUNTER	(10*HZ/100)	/* 100 ms time slice */
 #define MAX_COUNTER	(20*HZ/100)
 #define DEF_NICE	(0)
+#define ALL_CPUS_ALLOWED (-1)
 
 
 /*
@@ -461,7 +456,7 @@
     policy:		SCHED_OTHER,					\
     mm:			NULL,						\
     active_mm:		&init_mm,					\
-    cpus_allowed:	-1,						\
+    cpus_allowed:	ALL_CPUS_ALLOWED,				\
     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
     next_task:		&tsk,						\
     prev_task:		&tsk,						\
@@ -846,18 +841,497 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
+static inline int task_on_runqueue(struct task_struct *p)
+{
+	return (p->run_list.next != NULL);
+}
+
+/*
+ * runqueue_data
+ * 	One runqueue per CPU in the system, plus one additional runqueue for
+ * realtime tasks.  Size should be a multiple of cache line size, and array
+ * of items should start on a cache line boundary.
+ */
+typedef union runqueue_data {
+	struct rq_data {
+		int nt_running;			/* # of tasks on runqueue */
+#ifdef CONFIG_SMP
+		int max_na_goodness;		/* maximum non-affinity */
+						/* goodness value of    */
+						/* 'schedulable' task   */
+						/* on this runqueue     */
+		struct task_struct * max_na_ptr; /* pointer to task which */
+						 /* has max_na_goodness   */
+		unsigned long max_na_cpus_allowed; /* copy of cpus_allowed */
+						   /* field from task with */
+						   /* max_na_goodness      */
+#endif
+		struct list_head runqueue;	/* list of tasks on runqueue */
+#ifdef CONFIG_SMP
+		int running_non_idle;		/* flag to indicate this cpu */
+						/* is running something      */
+						/* besides the idle thread   */
+#endif
+		spinlock_t runqueue_lock;	/* lock for this runqueue */
+	} rq_data;
+	char __pad [SMP_CACHE_BYTES];
+} runqueue_data_t;
+#define nt_running(cpu) runqueue_data[(cpu)].rq_data.nt_running
+#ifdef CONFIG_SMP
+#define max_na_goodness(cpu) runqueue_data[(cpu)].rq_data.max_na_goodness
+#define max_na_ptr(cpu) runqueue_data[(cpu)].rq_data.max_na_ptr
+#define max_na_cpus_allowed(cpu) \
+	runqueue_data[(cpu)].rq_data.max_na_cpus_allowed
+#endif
+#define runqueue(cpu) runqueue_data[(cpu)].rq_data.runqueue
+#define runqueue_lock(cpu) runqueue_data[(cpu)].rq_data.runqueue_lock
+#ifdef CONFIG_SMP
+#define running_non_idle(cpu) runqueue_data[(cpu)].rq_data.running_non_idle
+#endif
+extern runqueue_data_t runqueue_data[];
+
+#ifdef CONFIG_SMP
+#define INIT_RUNQUEUE_DATA_SMP(n) {				\
+	max_na_goodness((n)) = MIN_GOODNESS;		\
+	max_na_ptr((n)) = NULL;					\
+	/* max_na_cpus_allowed need not be initialized */	\
+	running_non_idle((n)) = 0;				\
+}
+#else
+#define INIT_RUNQUEUE_DATA_SMP(n) 	/* NOOP */
+#endif
+#define INIT_RUNQUEUE_DATA(n) {					\
+	nt_running((n)) = 0;					\
+	INIT_LIST_HEAD(&runqueue((n)));				\
+	runqueue_lock((n)) = SPIN_LOCK_UNLOCKED;		\
+	INIT_RUNQUEUE_DATA_SMP((n));				\
+}
+#define N_RUNQUEUES		(NR_CPUS + 1)
+#define N_ALIGNED_DATA		NR_CPUS
+#define	REALTIME_RQ_ID		NR_CPUS
+#define MIN_GOODNESS		-1000
+#ifndef CONFIG_SMP
+#define	UP_RQ_LOCK_ID		0
+#endif
+
+/*
+ * aligned_data
+ *	CPU specific scheduling data.  One data item for each CPU
+ * in the system.  Size should be a multiple of cache line size,
+ * and array of items should start on a cache line boundary.
+ */
+typedef union aligned_data {
+	struct schedule_data {
+		struct task_struct * curr;	/* current task on this CPU */
+		cycles_t last_schedule;		/* time of last scheduling */
+						/* decision                */
+#ifdef CONFIG_SMP
+		int curr_na_goodness;		/* non-affinity goodness */
+						/* value of current task */
+#endif
+	} schedule_data;
+	char __pad [SMP_CACHE_BYTES];
+} aligned_data_t;
+#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+#ifdef CONFIG_SMP
+#define curr_na_goodness(cpu) aligned_data[(cpu)].schedule_data.curr_na_goodness
+#endif
+#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+extern aligned_data_t aligned_data[];
+
+#ifdef CONFIG_SMP
+#define INIT_ALIGNED_DATA_SMP(n) {			\
+	curr_na_goodness((n)) = MIN_GOODNESS;	\
+}
+#else
+#define INIT_ALIGNED_DATA_SMP(n) 	/* NOOP */
+#endif
+#define INIT_ALIGNED_DATA(n) {				\
+	cpu_curr((n)) = &init_task;			\
+	last_schedule((n)) = 0;				\
+	INIT_ALIGNED_DATA_SMP((n));			\
+}
+
+/*
+ * Determine runqueue associated with task
+ */
+static inline int task_to_runqueue(struct task_struct *t)
+{
+	int rq;
+
+	if ((t->policy & ~SCHED_YIELD) != SCHED_OTHER) {
+		rq = REALTIME_RQ_ID;
+	} else {
+		rq = t->processor;
+	}
+
+	return(rq);
+}
+#define TASK_RQ(t)	runqueue(task_to_runqueue((t)))
+
+/*
+ * Sum CPU specific nt_running fields to determine how many
+ * runnable tasks there are in the system.
+ */
+static inline int nr_running(void)
+{
+	int i;
+	int tot=nt_running(REALTIME_RQ_ID);
+
+	for(i=0; i<smp_num_cpus; i++) {
+		tot += nt_running(cpu_logical_map(i));
+	}
+
+	return(tot);
+}
+
+/*
+ * The following macros and the base_goodness() routine contain common
+ * code for the 'exported' goodness routines which follow.
+ */
+#define RT_GOODNESS_MIN	1000
+#define RT_GOODNESS(t) (RT_GOODNESS_MIN + (t)->rt_priority)
+#define MM_GOODNESS(t, this_mm)	((t)->mm == (this_mm) ? 1 : 0)
+#define CPU_GOODNESS(t, this_cpu) ((t)->processor == (this_cpu) ? \
+					PROC_CHANGE_PENALTY : 0)
+static inline int base_goodness(struct task_struct * t)
+{
+	int weight;
+
+	weight = -1;
+	if (t->policy & SCHED_YIELD)
+		goto out;
+
+	/*
+	 * base_goodness is based on the number of ticks left.
+	 * Don't do any other calculations if the time slice is
+	 * over..
+	 */
+	weight = t->counter;
+	if (!weight)
+		goto out;
+			
+	/*
+	 * Factor in the nice value
+	 */
+	weight += 20 - t->nice;
+
+out:
+	return weight;
+}
+
+/*
+ * non-affinity goodness value of a task.  MM and CPU affinity are not
+ * taken into account.
+ */
+static inline int na_goodness(struct task_struct * t)
+{
+	/*
+	 * Normal tasks first
+	 */
+	if ((t->policy & ~SCHED_YIELD) == SCHED_OTHER) {
+		return (base_goodness(t));
+	}
+
+	/*
+	 * Realtime task
+	 */
+	return (RT_GOODNESS(t));
+}
+
+/*
+ * Stripped down version of goodness routine to be used on a CPU
+ * specific (local) runqueue.  This routine does not need to be
+ * concerned with realtime tasks, and does not need to take CPU
+ * affinity into account.
+ */
+static inline int local_goodness(struct task_struct * t,
+						struct mm_struct *this_mm)
+{
+	int weight = base_goodness(t);
+
+	if (weight > 0) {
+		weight += MM_GOODNESS(t, this_mm);
+	}
+
+	return(weight);
+}
+
+/*
+ * Full-blown goodness function.  This is the function that decides how
+ * desirable a process is.  You can weigh different processes against
+ * each other depending on what CPU they've run on lately etc to try to
+ * handle cache and TLB miss penalties.
+ *
+ * Return values:
+ *               <0: don't select this one
+ *                0: out of time, recalculate counters (but it might still be
+ *                   selected)
+ *              +ve: "goodness" value (the larger, the better)
+ * +RT_GOODNESS_MIN: realtime process, select this.
+ */
+static inline int goodness(struct task_struct * t, int this_cpu,
+						struct mm_struct *this_mm)
 {
-	nr_running--;
+	int weight;
+
+	/*
+	 * Normal tasks first
+	 */
+	if ((t->policy & ~SCHED_YIELD) == SCHED_OTHER) {
+		weight = base_goodness(t);
+		if (weight > 0) {
+			weight += MM_GOODNESS(t, this_mm);
+#ifdef CONFIG_SMP
+			weight += CPU_GOODNESS(t, this_cpu);
+#endif
+		}
+		return(weight);
+	}
+
+	/*
+	 * Realtime task
+	 */
+	return (RT_GOODNESS(t));
+}
+
+/*
+ * Common code for add to runqueue.  In SMP, update max_na_* values
+ * for the runqueue if appropriate.
+ */
+static inline void add_to_runqueue_common(struct task_struct * p, int upd)
+{
+	int rq = task_to_runqueue(p);
+#ifdef CONFIG_SMP
+	int tsk_na_goodness = na_goodness(p);
+
+	if (upd &&
+	    !p->has_cpu && (tsk_na_goodness > max_na_goodness(rq))) {
+		max_na_goodness(rq) = tsk_na_goodness;
+		max_na_cpus_allowed(rq) = p->cpus_allowed;
+		max_na_ptr(rq) = p;
+	}
+#endif
+	list_add(&p->run_list, &runqueue(rq));
+	nt_running(rq)++;
+}
+static inline void add_to_runqueue(struct task_struct * p)
+{
+	add_to_runqueue_common(p, 1);
+}
+static inline void add_to_runqueue_noupd(struct task_struct * p)
+{
+	add_to_runqueue_common(p, 0);
+}
+
+/*
+ * Common routine for both flavors of del_from_runqueue.  Expensive scan
+ * of runqueue only happens in SMP if explicitly requested.
+ */
+static inline void del_from_runqueue_common(struct task_struct * p, int upd)
+{
+	int rq = task_to_runqueue(p);
+
+	nt_running(rq)--;
 	p->sleep_time = jiffies;
 	list_del(&p->run_list);
 	p->run_list.next = NULL;
+
+#ifdef CONFIG_SMP
+	if (max_na_ptr(rq) == p) {
+		if (upd) {
+			/*
+			 * If we want to update max_na_* values for the
+			 * runqueue, then we scan the queue and look for
+			 * the FIRST schedulable task.  This is a 'good
+			 * enough' approximation.
+			 */
+			struct list_head *tmp;
+			struct task_struct *t, *tmp_task = NULL;
+			int weight, tmp_weight = 0;
+
+			list_for_each(tmp, &runqueue(rq)) {
+				t = list_entry(tmp, struct task_struct,
+								run_list);
+				if (!t->has_cpu) {
+					weight = na_goodness(t);
+					if (weight > tmp_weight) {
+						tmp_weight = weight;
+						tmp_task = t;
+						goto found_one;
+					}
+				}
+			}
+found_one:
+			if (tmp_weight) {
+				max_na_goodness(rq) = tmp_weight;
+				max_na_cpus_allowed(rq) =
+							tmp_task->cpus_allowed;
+				max_na_ptr(rq) = tmp_task;
+			} else {
+				max_na_goodness(rq) = MIN_GOODNESS;
+				max_na_ptr(rq) = NULL;
+			}
+		} else {
+			max_na_goodness(rq) = MIN_GOODNESS;
+			max_na_ptr(rq) = NULL;
+		}
+	}
+#endif
+}
+/*
+ * del_from_runqueue without updating max_na_* values.  Used in
+ * places where we know we will be updating these values before
+ * dropping the runqueue lock.
+ */
+static inline void del_from_runqueue(struct task_struct * p)
+{
+	del_from_runqueue_common(p, 0);
+}
+/*
+ * del_from_runqueue_update will update the max_na_* values
+ * if necessary.
+ */
+static inline void del_from_runqueue_update(struct task_struct * p)
+{
+	del_from_runqueue_common(p, 1);
 }
 
-static inline int task_on_runqueue(struct task_struct *p)
+/*
+ * Macros to call runqueue locking routines
+ */
+#ifdef CONFIG_SMP
+#define LOCK_REALTIME_RQ()					\
+		spin_lock(&runqueue_lock(REALTIME_RQ_ID));
+#define UNLOCK_REALTIME_RQ()					\
+		spin_unlock(&runqueue_lock(REALTIME_RQ_ID));
+#else
+#define LOCK_REALTIME_RQ()		/* NOOP */
+#define UNLOCK_REALTIME_RQ()		/* NOOP */
+#endif
+#define LOCK_TASK_CPU_RQ_IRQ(t)					\
+		lock_task_cpu_rq_irq(t)
+#define UNLOCK_TASK_CPU_RQ_IRQ(t)				\
+		unlock_task_cpu_rq_irq(t)
+#define LOCK_TASK_RQ_VERIFY(t)					\
+		lock_task_rq_verify(t)
+#ifdef CONFIG_SMP
+#define UNLOCK_TASK_RQ(t)					\
+		spin_unlock(&runqueue_lock(task_to_runqueue((t))));
+#else
+#define UNLOCK_TASK_RQ(t)					\
+		spin_unlock(&runqueue_lock(UP_RQ_LOCK_ID));
+#endif
+#define LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(t, flags)		\
+		lock_task_cpu_rq_irqsave_verify(t, flags)
+#define UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, t, flags)		\
+		unlock_task_cpu_rq_irqsave(cpu_rq, t, flags)
+#define LOCK_TASK_CPU_RQ_IRQ_VERIFY(t)				\
+		lock_task_cpu_rq_irq_verify(t)
+
+static inline void lock_task_cpu_rq_irq(struct task_struct *t)
 {
-	return (p->run_list.next != NULL);
+#ifdef CONFIG_SMP
+	spin_lock_irq(&runqueue_lock(t->processor));
+	if (task_to_runqueue(t) == REALTIME_RQ_ID) {
+		LOCK_REALTIME_RQ();
+	}
+#else
+	spin_lock_irq(&runqueue_lock(UP_RQ_LOCK_ID));
+#endif
+}
+
+static inline void unlock_task_cpu_rq_irq(struct task_struct *t)
+{
+#ifdef CONFIG_SMP
+	if (task_to_runqueue(t) == REALTIME_RQ_ID) {
+		UNLOCK_REALTIME_RQ();
+	}
+	spin_unlock_irq(&runqueue_lock(t->processor));
+#else
+	spin_unlock_irq(&runqueue_lock(UP_RQ_LOCK_ID));
+#endif
+}
+
+static inline void lock_task_rq_verify(struct task_struct *t)
+{
+#ifdef CONFIG_SMP
+	int rq = task_to_runqueue(t);
+
+	spin_lock(&runqueue_lock(rq));
+	while (rq != task_to_runqueue(t)) {
+		spin_unlock(&runqueue_lock(rq));
+		rq = task_to_runqueue(t);
+		spin_lock(&runqueue_lock(rq));
+	}
+#else
+	spin_lock(&runqueue_lock(UP_RQ_LOCK_ID));
+#endif
 }
+
+static inline int lock_task_cpu_rq_irqsave_verify(struct task_struct *t,
+						unsigned long *flags)
+{
+#ifdef CONFIG_SMP
+	int rq = t->processor;
+
+	spin_lock_irqsave(&runqueue_lock(rq), *flags);
+	while (t->processor != rq) {
+		spin_unlock_irqrestore(&runqueue_lock(rq), *flags);
+		rq = t->processor;
+		spin_lock_irqsave(&runqueue_lock(rq), *flags);
+	}
+	if (task_to_runqueue(t) == REALTIME_RQ_ID) {
+		LOCK_REALTIME_RQ();
+	}
+	return(rq);
+#else
+	spin_lock_irqsave(&runqueue_lock(UP_RQ_LOCK_ID), *flags);
+	return(UP_RQ_LOCK_ID);
+#endif
+}
+
+static inline void unlock_task_cpu_rq_irqsave(int cpu_rq, struct task_struct *t,
+						unsigned long flags)
+{
+#ifdef CONFIG_SMP
+	int rq = task_to_runqueue(t);
+
+	if (rq == REALTIME_RQ_ID) {
+		UNLOCK_REALTIME_RQ();
+	}
+	spin_unlock_irqrestore(&runqueue_lock(cpu_rq), flags);
+#else
+	spin_unlock_irqrestore(&runqueue_lock(UP_RQ_LOCK_ID), flags);
+#endif
+}
+
+static inline void lock_task_cpu_rq_irq_verify(struct task_struct *t)
+{
+#ifdef CONFIG_SMP
+	int rq = t->processor;
+
+	spin_lock_irq(&runqueue_lock(rq));
+	while (t->processor != rq) {
+		spin_unlock_irq(&runqueue_lock(rq));
+		rq = t->processor;
+		spin_lock_irq(&runqueue_lock(rq));
+	}
+	if (task_to_runqueue(t) == REALTIME_RQ_ID) {
+		LOCK_REALTIME_RQ();
+	}
+#else
+	spin_lock_irq(&runqueue_lock(UP_RQ_LOCK_ID));
+#endif
+}
+
+#ifdef CONFIG_SMP
+#define UPDATE_SCHED_DATA(tc, next)	update_sched_data(tc, next)
+#define EXAMINE_RMT_RQS(tc, c, p, n)	examine_rmt_rqs(tc, c, p, n)
+#else
+#define UPDATE_SCHED_DATA(tc, next)	/* NOOP */
+#define EXAMINE_RMT_RQS(tc, c, p, n)	(n)
+#endif
 
 static inline void unhash_process(struct task_struct *p)
 {
diff -Naur linux-2.4.14/kernel/fork.c linux-2.4.14-mq/kernel/fork.c
--- linux-2.4.14/kernel/fork.c	Wed Oct 24 00:44:15 2001
+++ linux-2.4.14-mq/kernel/fork.c	Tue Nov  6 22:19:27 2001
@@ -28,7 +28,6 @@
 
 /* The idle threads do not count.. */
 int nr_threads;
-int nr_running;
 
 int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
diff -Naur linux-2.4.14/kernel/sched.c linux-2.4.14-mq/kernel/sched.c
--- linux-2.4.14/kernel/sched.c	Wed Oct 17 21:14:37 2001
+++ linux-2.4.14-mq/kernel/sched.c	Tue Nov  6 22:19:27 2001
@@ -80,33 +80,20 @@
 /*
  * The tasklist_lock protects the linked list of processes.
  *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
- *
  * If both locks are to be concurrently held, the runqueue_lock
  * nests inside the tasklist_lock.
  *
  * task->alloc_lock nests inside tasklist_lock.
  */
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
-
 /*
- * We align per-CPU scheduling data on cacheline boundaries,
- * to prevent cacheline ping-pong.
+ * runqueue_data and aligned_data contain CPU specific scheduling data.
+ * There is one runqueue per CPU in the system, plus an additional
+ * runqueue for realtime tasks.  Initialization is performed in sched_init().
  */
-static union {
-	struct schedule_data {
-		struct task_struct * curr;
-		cycles_t last_schedule;
-	} schedule_data;
-	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
-
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+runqueue_data_t runqueue_data [N_RUNQUEUES] __cacheline_aligned;
+aligned_data_t aligned_data [N_ALIGNED_DATA] __cacheline_aligned;
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
@@ -116,6 +103,8 @@
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
 				((p)->cpus_allowed & (1 << cpu)))
+#define local_can_schedule(p) (!(p)->has_cpu)
+#define this_cpu_allowed(ca, tcpu) ((ca) & (1 << tcpu))
 
 #else
 
@@ -127,72 +116,6 @@
 void scheduling_functions_start_here(void) { }
 
 /*
- * This is the function that decides how desirable a process is..
- * You can weigh different processes against each other depending
- * on what CPU they've run on lately etc to try to handle cache
- * and TLB miss penalties.
- *
- * Return values:
- *	 -1000: never select this
- *	     0: out of time, recalculate counters (but it might still be
- *		selected)
- *	   +ve: "goodness" value (the larger, the better)
- *	 +1000: realtime process, select this.
- */
-
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
-{
-	int weight;
-
-	/*
-	 * select the current process after every other
-	 * runnable process, but before the idle thread.
-	 * Also, dont trigger a counter recalculation.
-	 */
-	weight = -1;
-	if (p->policy & SCHED_YIELD)
-		goto out;
-
-	/*
-	 * Non-RT process - normal case first.
-	 */
-	if (p->policy == SCHED_OTHER) {
-		/*
-		 * Give the process a first-approximation goodness value
-		 * according to the number of clock-ticks it has left.
-		 *
-		 * Don't do any other calculations if the time slice is
-		 * over..
-		 */
-		weight = p->counter;
-		if (!weight)
-			goto out;
-			
-#ifdef CONFIG_SMP
-		/* Give a largish advantage to the same processor...   */
-		/* (this is equivalent to penalizing other processors) */
-		if (p->processor == this_cpu)
-			weight += PROC_CHANGE_PENALTY;
-#endif
-
-		/* .. and a slight advantage to the current MM */
-		if (p->mm == this_mm || !p->mm)
-			weight += 1;
-		weight += 20 - p->nice;
-		goto out;
-	}
-
-	/*
-	 * Realtime process, select the first one on the
-	 * runqueue (taking priorities within processes
-	 * into account).
-	 */
-	weight = 1000 + p->rt_priority;
-out:
-	return weight;
-}
-
-/*
  * the 'goodness value' of replacing a process on a given CPU.
  * positive value means 'replace', zero or negative means 'dont'.
  */
@@ -202,126 +125,212 @@
 }
 
 /*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
+ * reschedule_idle - Determine which CPU the specified task
+ * should run on.  The runqueue lock must be held upon entry to this
+ * routine.
  */
 static FASTCALL(void reschedule_idle(struct task_struct * p));
 
 static void reschedule_idle(struct task_struct * p)
 {
 #ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
+	struct task_struct *tsk;
+	int target_cpu, this_cpu, tsk_cpu;
+	int i, cpu;
+	int need_resched;
+	cycles_t curr_cycles, tmp_cycles;
+	int stack_list[NR_CPUS];
+	int saved_na_goodness, tmp_min_na_goodness;
+
+	tsk_cpu = p->processor;
+	this_cpu = smp_processor_id();
+	/*
+	 * First check if the task's previous CPU is idle,  use it if it is.
+	 */
+	if (can_schedule(p, tsk_cpu) &&
+	    (cpu_curr(tsk_cpu) == idle_task(tsk_cpu))) {
+		if (!task_on_runqueue(p)) {
+			add_to_runqueue(p);
+		}
+		tsk = cpu_curr(tsk_cpu);
+		need_resched = tsk->need_resched;
+		tsk->need_resched = 1;
+		if ((tsk_cpu != this_cpu) && !need_resched) {
+			smp_send_reschedule(tsk_cpu);
+		}
+		return;
+	}
 
 	/*
-	 * shortcut if the woken up task's last CPU is
-	 * idle now.
-	 */
-	best_cpu = p->processor;
-	if (can_schedule(p, best_cpu)) {
-		tsk = idle_task(best_cpu);
-		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
-send_now_idle:
+	 * Create a list of current na_goodness values on our stack.
+	 * Only values less than the non-affinity goodness value of
+	 * p should be considered for preemption.
+	 */
+	saved_na_goodness = na_goodness(p); /* preemption_goodness() > 0 */
+	tmp_min_na_goodness = saved_na_goodness;
+	curr_cycles = get_cycles();
+	target_cpu = -1;
+	for (i = 0; i < smp_num_cpus; i++) {
+		cpu = cpu_logical_map(i);
+
+		if (!can_schedule(p, cpu)) {
+			stack_list[cpu] = saved_na_goodness;
+			continue;
+		}
+
+		if (curr_na_goodness(cpu) == MIN_GOODNESS) {
 			/*
-			 * If need_resched == -1 then we can skip sending
-			 * the IPI altogether, tsk->need_resched is
-			 * actively watched by the idle thread.
+			 * Indicates an idle task.  For idle tasks, determine
+			 * the amount of time they have been idle.  Use the
+			 * negative of this value in the list.  Hence, we
+			 * first choose the CPU that has been idle the longest.
 			 */
-			need_resched = tsk->need_resched;
-			tsk->need_resched = 1;
-			if ((best_cpu != this_cpu) && !need_resched)
-				smp_send_reschedule(best_cpu);
-			return;
+			tmp_cycles = curr_cycles - last_schedule(cpu);
+			if (tmp_cycles > INT_MAX) {
+				stack_list[cpu] = INT_MIN;
+			} else {
+				stack_list[cpu] = (int)-tmp_cycles;
+			}
+		} else {
+			stack_list[cpu] = curr_na_goodness(cpu);
+			/*
+			 * Add in PROC_CHANGE_PENALTY for remote CPUs
+			 */
+			if (cpu != tsk_cpu) {
+				stack_list[cpu] += PROC_CHANGE_PENALTY;
+			}
+		}
+
+		/*
+		 * Look for the lowest value
+		 */
+		if (stack_list[cpu] < tmp_min_na_goodness) {
+			target_cpu = cpu;
+			tmp_min_na_goodness = stack_list[cpu];
 		}
 	}
 
 	/*
-	 * We know that the preferred CPU has a cache-affine current
-	 * process, lets try to find a new idle CPU for the woken-up
-	 * process. Select the least recently active idle CPU. (that
-	 * one will have the least active cache context.) Also find
-	 * the executing process which has the least priority.
-	 */
-	oldest_idle = (cycles_t) -1;
-	target_tsk = NULL;
-	max_prio = 0;
+	 * We try to add the task to a runqueue starting with the one
+	 * that has the lowest na_goodness value.
+	 */
+	while (target_cpu != -1) {
+		if (target_cpu == tsk_cpu &&
+		    preemption_goodness((tsk = cpu_curr(target_cpu)),
+					p, target_cpu) > 0) {
+			/*
+			 * If target_cpu is tsk_cpu, then no additional
+			 * locking is required (we already have the CPU
+			 * specific runqueue locked).  We also know that
+			 * this CPU can not be idle, otherwise the fast
+			 * path at the beginning of this routine would
+			 * have been executed.  Therefore, simply send
+			 * the IPI if required.
+			 */
+			if (!task_on_runqueue(p)) {
+				add_to_runqueue(p);
+			}
+			tsk = cpu_curr(target_cpu);
+			tsk->need_resched = 1;
+			if (target_cpu != this_cpu) {
+				smp_send_reschedule(target_cpu);
+			}
+			return;
+		}
 
-	for (i = 0; i < smp_num_cpus; i++) {
-		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
-			continue;
-		tsk = cpu_curr(cpu);
 		/*
-		 * We use the first available idle CPU. This creates
-		 * a priority list between idle CPUs, but this is not
-		 * a problem.
+		 * Try to lock runqueue and verify na_goodness value.
 		 */
-		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
-				oldest_idle = last_schedule(cpu);
-				target_tsk = tsk;
-			}
-		} else {
-			if (oldest_idle == -1ULL) {
-				int prio = preemption_goodness(tsk, p, cpu);
+		else if (spin_trylock(&runqueue_lock(target_cpu))) {
+			tsk = cpu_curr(target_cpu);
+			if ((tsk == idle_task(target_cpu)) ||
+			     (preemption_goodness(tsk, p, target_cpu) > 0)) {
+				/*
+				 * Target CPU is idle, or it is running
+				 * a task with lower priority than p.
+				 * Therefore, move p to target runqueue.
+				 */
+				if (task_on_runqueue(p)) {
+					del_from_runqueue_update(p);
+				}
+				p->processor = target_cpu;
+				add_to_runqueue(p);
 
-				if (prio > max_prio) {
-					max_prio = prio;
-					target_tsk = tsk;
+				/*
+				 * Send an IPI to target CPU, unless the
+				 * CPU is idle and the need_resched flag
+				 * has already been set.
+				 */
+				need_resched = tsk->need_resched;
+				tsk->need_resched = 1;
+				if ((target_cpu != this_cpu) &&
+				    ((tsk != idle_task(target_cpu)) ||
+				      !need_resched)){
+					smp_send_reschedule(target_cpu);
 				}
+
+				spin_unlock(&runqueue_lock(target_cpu));
+
+				return;
 			}
+			spin_unlock(&runqueue_lock(target_cpu));
 		}
-	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
-			goto send_now_idle;
+
+		/*
+		 * Update list value so we don't check this CPU again.
+		 */
+		stack_list[target_cpu] = saved_na_goodness;
+
+		/*
+		 * Find the 'next lowest' curr_na_goodness value.
+		 */
+		target_cpu = -1;
+		tmp_min_na_goodness = saved_na_goodness;
+		for (i = 0; i < smp_num_cpus; i++) {
+			cpu = cpu_logical_map(i);
+			if (stack_list[cpu] < tmp_min_na_goodness) {
+				target_cpu = cpu;
+				tmp_min_na_goodness = stack_list[cpu];
+			}
 		}
-		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+	}
+
+	/*
+	 * If we get here, it means that the best place for the task is
+	 * on its currently assigned runqueue.  Also, we know that the
+	 * task currently running on this task's runqueue has sufficient
+	 * priority to prevent preemption.  Hence, we simply ensure the
+	 * task is on the runqueue.
+	 */
+	if (!task_on_runqueue(p)) {
+		add_to_runqueue(p);
 	}
 	return;
-		
 
 #else /* UP */
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk;
 
 	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	if (preemption_goodness(tsk, p, this_cpu) > 0) {
 		tsk->need_resched = 1;
+	}
+	if (!task_on_runqueue(p)) {
+		add_to_runqueue(p);
+	}
 #endif
 }
 
-/*
- * Careful!
- *
- * This has to add the process to the _beginning_ of the
- * run-queue, not the end. See the comment about "This is
- * subtle" in the scheduler proper..
- */
-static inline void add_to_runqueue(struct task_struct * p)
-{
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
-}
-
 static inline void move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, &TASK_RQ(p));
 }
 
 static inline void move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &TASK_RQ(p));
 }
 
 /*
@@ -336,20 +345,24 @@
 {
 	unsigned long flags;
 	int success = 0;
+	int cpu_rq;
+
+	cpu_rq = LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(p, &flags);
 
 	/*
 	 * We want the common case fall through straight, thus the goto.
 	 */
-	spin_lock_irqsave(&runqueue_lock, flags);
 	p->state = TASK_RUNNING;
 	if (task_on_runqueue(p))
 		goto out;
-	add_to_runqueue(p);
+
 	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 		reschedule_idle(p);
+	else
+		add_to_runqueue(p);
 	success = 1;
 out:
-	spin_unlock_irqrestore(&runqueue_lock, flags);
+	UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, p, flags);
 	return success;
 }
 
@@ -495,6 +508,7 @@
 needs_resched:
 	{
 		unsigned long flags;
+		int cpu_rq;
 
 		/*
 		 * Avoid taking the runqueue lock in cases where
@@ -504,15 +518,17 @@
 						(policy & SCHED_YIELD))
 			goto out_unlock;
 
-		spin_lock_irqsave(&runqueue_lock, flags);
+		cpu_rq = LOCK_TASK_CPU_RQ_IRQSAVE_VERIFY(prev, &flags);
+
 		if ((prev->state == TASK_RUNNING) && !prev->has_cpu)
 			reschedule_idle(prev);
-		spin_unlock_irqrestore(&runqueue_lock, flags);
+
+		UNLOCK_TASK_CPU_RQ_IRQSAVE(cpu_rq, prev, flags);
 		goto out_unlock;
 	}
 #else
 	prev->policy &= ~SCHED_YIELD;
-#endif /* CONFIG_SMP */
+#endif
 }
 
 void schedule_tail(struct task_struct *prev)
@@ -520,11 +536,322 @@
 	__schedule_tail(prev);
 }
 
+#ifdef CONFIG_SMP
 /*
- *  'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
- *
- * The goto is "interesting".
+ * Examine remote runqueues and look for a task which is more desirable
+ * to run than 'next'.
+ */
+static FASTCALL(struct task_struct *examine_rmt_rqs(int this_cpu, int *cg,
+					struct task_struct *prev,
+					struct task_struct *next));
+static struct task_struct *examine_rmt_rqs(int this_cpu, int *cg,
+					struct task_struct *prev,
+					struct task_struct *next)
+{
+	int premature_idle;
+	int rrq;
+	int tmp_na_goodness;
+	int rq;
+	int rcpu;
+	int retry;
+	int stack_list[NR_CPUS];
+
+	/*
+	 * cg (current goodness) does not contain a CPU affinity boost.
+	 * We must add this boost before comparing to tasks on other
+	 * runqueues.  Only add PROC_CHANGE_PENALTY if *cg is a positive
+	 * goodness value.
+	 */
+	if (*cg > 0) {
+		*cg += PROC_CHANGE_PENALTY;
+	}
+
+	/*
+	 * Copy max_na_goodness values from CPU specific runqueue
+	 * structures to the list on our stack.
+	 */
+scan_again:
+	premature_idle = 0;
+	rrq = -1;
+	tmp_na_goodness = *cg;
+	for (rq = 0; rq < smp_num_cpus; rq++) {
+		rcpu = cpu_logical_map(rq);
+		if (rcpu == this_cpu) {
+			stack_list[rcpu] = MIN_GOODNESS;
+			continue;
+		}
+		if (!this_cpu_allowed(max_na_cpus_allowed(rcpu), this_cpu)) {
+			stack_list[rcpu] = MIN_GOODNESS;
+			continue;
+		}
+		if (max_na_goodness(rcpu) <= *cg) {
+			stack_list[rcpu] = MIN_GOODNESS;
+			continue;
+		}
+
+		stack_list[rcpu] = max_na_goodness(rcpu);
+		if (stack_list[rcpu] > tmp_na_goodness) {
+			rrq = rcpu;
+			tmp_na_goodness = stack_list[rcpu];
+		}
+	}
+
+	/*
+	 * Now use the values from the stack list to search for a
+	 * task to steal.
+	 */
+	while (rrq != -1) {
+		/*
+		 * First try to lock the remote runqueue and verify
+		 * the max_na_goodness value.
+		 */
+		if (spin_trylock(&runqueue_lock(rrq))) {
+			if (max_na_goodness(rrq) > *cg && 
+			    this_cpu_allowed(max_na_cpus_allowed(rrq),
+								this_cpu)) {
+				/*
+				 * Move a remote task to our runqueue,
+				 * don't forget to update max_na_* values
+				 * for our queue.
+				 */
+				if (!next->has_cpu && 
+				    next != idle_task(this_cpu)) {
+					max_na_goodness(this_cpu) =
+							na_goodness(next);
+					max_na_cpus_allowed(this_cpu) =
+							next->cpus_allowed;
+					max_na_ptr(this_cpu) = next;
+				}
+				next = max_na_ptr(rrq);
+				*cg = max_na_goodness(rrq);
+				del_from_runqueue_update(next);
+				next->processor = this_cpu;
+				add_to_runqueue_noupd(next);
+
+				/*
+				 * We have stolen a task from another
+				 * runqueue, quit looking.
+				 */
+				spin_unlock(&runqueue_lock(rrq));
+					
+				break;
+			}
+			spin_unlock(&runqueue_lock(rrq));
+		} else {
+			premature_idle++;
+		}
+
+		/*
+		 * We were either unable to get the remote runqueue lock,
+		 * or the remote runqueue has changed such that it is no
+		 * longer desirable to steal a task from the queue.
+		 *
+		 * Go to the runqueue with the 'next highest' max_na_goodness
+		 * value.
+		 */
+		stack_list[rrq] = MIN_GOODNESS;
+		tmp_na_goodness = *cg;
+		rrq = -1;
+		for (rq = 0; rq < smp_num_cpus; rq++) {
+			rcpu = cpu_logical_map(rq);
+			if (stack_list[rcpu] > tmp_na_goodness) {
+				rrq = rcpu;
+				tmp_na_goodness = stack_list[rcpu];
+			}
+		}
+	}
+
+	/*
+	 * Check for going idle prematurely.  If this is the case, there
+	 * is a good chance there are schedulable tasks on other runqueues.
+	 */
+	retry = (next == idle_task(this_cpu)) &&
+		(task_to_runqueue(prev) != REALTIME_RQ_ID);
+	if (retry && premature_idle) {
+		/*
+		 * Be sure to clear max_na_goodness, otherwise there
+		 * is the potential for deadlock.
+		 */
+		max_na_goodness(this_cpu) = MIN_GOODNESS;
+		goto scan_again;
+	}
+
+	return(next);
+}
+
+/*
+ * Find the next best schedulable task on the runqueue.  In addition, while
+ * scanning the queue, keep track of the 'second best' schedulable task and
+ * update the runqueue's max_na_* values accordingly.
+ */
+static FASTCALL(struct task_struct *scan_runqueue(struct task_struct *prev,
+		int *cg, struct task_struct *next));
+static struct task_struct *scan_runqueue(struct task_struct *prev, int *cg,
+		struct task_struct *next)
+{
+	struct task_struct *p;
+	struct list_head *tmp;
+	int prev_next_weight;
+	struct task_struct *prev_next;
+	int rq = prev->processor;
+
+	prev_next = idle_task(rq);
+	prev_next_weight = MIN_GOODNESS;
+	/*
+	 * Search local (CPU specific) runqueue
+	 */
+	list_for_each(tmp, &runqueue(rq)) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		if (local_can_schedule(p)) {
+			int weight = local_goodness(p, prev->active_mm);
+			if (weight > *cg) {
+				if (!next->has_cpu) {
+					/*
+					 * prev_next must point to a
+					 * schedulable task.
+					 */
+					prev_next_weight = *cg;
+					prev_next = next;
+				}
+				*cg = weight;
+				next = p;
+			} else if (weight > prev_next_weight) {
+				prev_next_weight = weight;
+				prev_next = p;
+			}
+		}
+	}
+
+	/*
+	 * Update max_na_* values for this runqueue
+	 */
+	if (prev_next != idle_task(rq)) {
+		max_na_goodness(rq) = na_goodness(prev_next);
+		max_na_cpus_allowed(rq) = prev_next->cpus_allowed;
+		max_na_ptr(rq) = prev_next;
+	} else {
+		max_na_goodness(rq) = MIN_GOODNESS;
+		/* max_na_cpus_allowed need not be set */
+		max_na_ptr(rq) = NULL;
+	}
+
+	return(next);
+}
+
+static FASTCALL(struct task_struct * scan_rt_runqueue(struct task_struct *prev,
+					int *cg, struct task_struct *next));
+static struct task_struct * scan_rt_runqueue(struct task_struct *prev,
+					int *cg, struct task_struct *next)
+{
+	struct list_head *tmp;
+	struct task_struct *p;
+	int this_cpu = prev->processor;
+
+	if (task_to_runqueue(prev) != REALTIME_RQ_ID) {
+		LOCK_REALTIME_RQ();
+	}
+
+	/*
+	 * Scan the queue.  Note that there is no need to
+	 * keep track of max_na_goodness values for the
+	 * realtime runqueue, as they are never used in
+	 * scheduling decisions.
+	 */
+	list_for_each(tmp, &runqueue(REALTIME_RQ_ID)) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		if (can_schedule(p, this_cpu)) {
+			int weight = RT_GOODNESS(p);
+			if (weight > *cg)
+				*cg = weight, next = p;
+		}
+	}
+
+	/*
+	 * If we found a realtime task, make it non-schedulable (set
+	 * has_cpu) before potentially releasing the queue's lock.
+	 * This prevents other CPUs from trying to steal the task
+	 * before we can switch to it.
+	 * Note that we still hold the CPU specific runqueue lock.
+	 */
+	if (task_to_runqueue(next) == REALTIME_RQ_ID) {
+		next->has_cpu = 1;
+	}
+	if (task_to_runqueue(prev) != REALTIME_RQ_ID) {
+		UNLOCK_REALTIME_RQ();
+	}
+
+	return(next);
+}
+
+static inline void update_sched_data(int this_cpu, struct task_struct *next)
+{
+	/*
+	 * Update values in aligned_data
+	 */
+	if (next != idle_task(this_cpu)) {
+		curr_na_goodness(this_cpu) = na_goodness(next);
+		running_non_idle(this_cpu) = 1;
+	} else {
+		curr_na_goodness(this_cpu) = MIN_GOODNESS;
+		running_non_idle(this_cpu) = 0;
+	}
+
+}
+#else
+
+/*
+ * Scan local runqueue, UP version; unnecessary searches/updates removed
+ */
+static inline struct task_struct *scan_runqueue(struct task_struct *prev,
+	int *cg, struct task_struct *next)
+{
+	struct task_struct *p;
+	struct list_head *tmp;
+	int this_cpu = prev->processor;
+	int weight;
+
+	/*
+	 * Search local (CPU specific) runqueue
+	 */
+	list_for_each(tmp, &runqueue(this_cpu)) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		weight = local_goodness(p, prev->active_mm);
+		if (weight > *cg)
+			*cg = weight, next = p;
+	}
+
+	return(next);
+}
+
+/*
+ * Scan RT runqueue, UP version, with unnecessary locking removed
+ */
+static inline struct task_struct * scan_rt_runqueue(struct task_struct *prev,
+					int *cg, struct task_struct *next)
+{
+	struct list_head *tmp;
+	struct task_struct *p;
+	int weight;
+
+	/*
+	 * Scan the queue.  Note that there is no need to
+	 * keep track of max_na_goodness values for the
+	 * realtime runqueue, as they are never used in
+	 * scheduling decisions.
+	 */
+	list_for_each(tmp, &runqueue(REALTIME_RQ_ID)) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		weight = RT_GOODNESS(p);
+		if (weight > *cg)
+			*cg = weight, next = p;
+	}
+
+	return(next);
+}
+#endif
+
+/*
+ *  'schedule()' is the scheduler function.
  *
  *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
@@ -532,13 +859,11 @@
  */
 asmlinkage void schedule(void)
 {
-	struct schedule_data * sched_data;
-	struct task_struct *prev, *next, *p;
-	struct list_head *tmp;
+	struct task_struct *prev, *next;
 	int this_cpu, c;
+	int update_prev_processor = 0;
 
-
-	spin_lock_prefetch(&runqueue_lock);
+	spin_lock_prefetch(&runqueue_lock(current->processor));
 
 	if (!current->active_mm) BUG();
 need_resched_back:
@@ -551,12 +876,9 @@
 	release_kernel_lock(prev, this_cpu);
 
 	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
+	 * Lock runqueue(s) associated with task
 	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
-
-	spin_lock_irq(&runqueue_lock);
+	LOCK_TASK_CPU_RQ_IRQ(prev);
 
 	/* move an exhausted RR process to be last.. */
 	if (prev->policy == SCHED_RR)
@@ -584,34 +906,69 @@
 	 * Default process to select..
 	 */
 	next = idle_task(this_cpu);
-	c = -1000;
+	c = MIN_GOODNESS;
 	if (prev->state == TASK_RUNNING)
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
-		p = list_entry(tmp, struct task_struct, run_list);
-		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
-			if (weight > c)
-				c = weight, next = p;
+	/*
+	 * First check the realtime runqueue
+	 */
+	if (nt_running(REALTIME_RQ_ID)) {
+		next = scan_rt_runqueue(prev, &c, next);
+		if (task_to_runqueue(next) == REALTIME_RQ_ID) {
+			 /* Found a RT task, no need to look elsewhere */
+			goto set_sched_data;
 		}
 	}
 
-	/* Do we need to re-calculate counters? */
+	/*
+	 * Search CPU specific runqueue
+	 */
+	next = scan_runqueue(prev, &c, next);
+
+	/*
+	 * Take a look at the remote runqueues
+	 */
+	next = EXAMINE_RMT_RQS(this_cpu, &c, prev, next);
+
+	/* Do we need to re-calculate counters for ALL tasks? */
 	if (!c)
 		goto recalculate;
+
+set_sched_data:
+	/*
+	 * Update scheduler data
+	 */
+	UPDATE_SCHED_DATA(this_cpu, next);
+
+	/*
+	 * Update scheduling fields in next task structure.
+	 */
+ 	next->has_cpu = 1;
+	next->processor = this_cpu;
+
 	/*
 	 * from this point on nothing can prevent us from
 	 * switching to the next task, save this fact in
 	 * sched_data.
 	 */
-	sched_data->curr = next;
-#ifdef CONFIG_SMP
- 	next->has_cpu = 1;
-	next->processor = this_cpu;
-#endif
-	spin_unlock_irq(&runqueue_lock);
+	cpu_curr(this_cpu) = next;
+
+	UNLOCK_TASK_CPU_RQ_IRQ(prev);
+
+	if (update_prev_processor) {
+		unsigned long allowed = prev->cpus_allowed;
+		/*
+		 * current task (prev) has had its cpus_allowed field
+		 * updated and should no longer run on this CPU.
+		 */
+		prev->processor = 0;
+		while (!(allowed & 1UL)) {
+			prev->processor++;
+			allowed = allowed >> 1;
+		}
+	}
 
 	if (prev == next) {
 		/* We won't go through the normal tail, so do this by hand */
@@ -627,15 +984,14 @@
 	 * and it's approximate, so we do not have to maintain
 	 * it while holding the runqueue spinlock.
  	 */
- 	sched_data->last_schedule = get_cycles();
+ 	last_schedule(this_cpu) = get_cycles();
 
 	/*
 	 * We drop the scheduler lock early (it's a global spinlock),
 	 * thus we have to lock the previous process from getting
 	 * rescheduled during switch_to().
 	 */
-
-#endif /* CONFIG_SMP */
+#endif
 
 	kstat.context_swtch++;
 	/*
@@ -684,18 +1040,29 @@
 recalculate:
 	{
 		struct task_struct *p;
-		spin_unlock_irq(&runqueue_lock);
+		UNLOCK_TASK_CPU_RQ_IRQ(prev);
 		read_lock(&tasklist_lock);
 		for_each_task(p)
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
 		read_unlock(&tasklist_lock);
-		spin_lock_irq(&runqueue_lock);
+		LOCK_TASK_CPU_RQ_IRQ(prev);
 	}
 	goto repeat_schedule;
 
 still_running:
-	if (!(prev->cpus_allowed & (1UL << this_cpu)))
+	if (!(prev->cpus_allowed & (1UL << this_cpu))) {
+		/*
+		 * task is no longer runnable on this CPU.  We remove it
+		 * from the runqueue to ensure that it will not be considered
+		 * for scheduling here.  Let schedule_tail take care of
+		 * sending it off to an appropriate CPU.
+		 */
+		if (!update_prev_processor) {
+			del_from_runqueue(prev);
+			update_prev_processor++;
+		}
 		goto still_running_back;
+	}
 	c = goodness(prev, this_cpu, prev->active_mm);
 	next = prev;
 	goto still_running_back;
@@ -913,6 +1280,7 @@
 	struct sched_param lp;
 	struct task_struct *p;
 	int retval;
+	int was_on_rq = 0;
 
 	retval = -EINVAL;
 	if (!param || pid < 0)
@@ -922,18 +1290,20 @@
 	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
 		goto out_nounlock;
 
+	retval = -ESRCH;
 	/*
-	 * We play safe to avoid deadlocks.
+	 * Note that here we get the tasklist lock so that we can
+	 * find the task struct.  Not until we have access to the
+	 * task struct can we determine what runqueue to lock.
 	 */
-	read_lock_irq(&tasklist_lock);
-	spin_lock(&runqueue_lock);
-
+	read_lock(&tasklist_lock);
 	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		goto out_nounlock;
+	}
+	LOCK_TASK_CPU_RQ_IRQ_VERIFY(p);
 
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
-			
 	if (policy < 0)
 		policy = p->policy;
 	else {
@@ -962,16 +1332,35 @@
 		goto out_unlock;
 
 	retval = 0;
+
+	if ((p->policy == SCHED_OTHER) && (policy != SCHED_OTHER)) {
+		/*
+		 * If changing to a realtime policy, we lock the
+		 * realtime runqueue so that we can (potentially)
+		 * move p to it.
+		 *
+		 * Note: If we lock the realtime runqueue here, it
+		 * means that p is becoming a realtime task, and
+		 * UNLOCK_TASK_CPU_RQ_IRQ() will take care of unlocking
+		 * the realtime runqueue.
+		 */
+		LOCK_REALTIME_RQ();
+	}
+
+	if (task_on_runqueue(p)) {
+		del_from_runqueue_update(p);
+		was_on_rq = 1;
+	}
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
-	if (task_on_runqueue(p))
-		move_first_runqueue(p);
-
+	if (was_on_rq) {
+		add_to_runqueue(p);
+	}
 	current->need_resched = 1;
 
 out_unlock:
-	spin_unlock(&runqueue_lock);
-	read_unlock_irq(&tasklist_lock);
+	UNLOCK_TASK_CPU_RQ_IRQ(p);
+	read_unlock(&tasklist_lock);
 
 out_nounlock:
 	return retval;
@@ -1048,19 +1437,18 @@
 	 * to be atomic.) In threaded applications this optimization
 	 * gets triggered quite often.
 	 */
-
-	int nr_pending = nr_running;
-
 #if CONFIG_SMP
+	int nr_pending = nt_running(REALTIME_RQ_ID);
 	int i;
 
-	// Subtract non-idle processes running on other CPUs.
+	// Subtract non-idle processes running on other CPUs.
 	for (i = 0; i < smp_num_cpus; i++) {
 		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
-			nr_pending--;
+		nr_pending += (nt_running(cpu) - running_non_idle(cpu));
 	}
 #else
+	int nr_pending = nr_running();
+
 	// on UP this process is on the runqueue as well
 	nr_pending--;
 #endif
@@ -1240,6 +1628,7 @@
 void reparent_to_init(void)
 {
 	struct task_struct *this_task = current;
+	unsigned long old_policy;
 
 	write_lock_irq(&tasklist_lock);
 
@@ -1259,11 +1648,23 @@
 
 	/* We also take the runqueue_lock while altering task fields
 	 * which affect scheduling decisions */
-	spin_lock(&runqueue_lock);
+	LOCK_TASK_RQ_VERIFY(this_task);
 
 	this_task->ptrace = 0;
+	/*
+	 * If this is a RT task, we must move it off the
+	 * RT runqueue (and onto a CPU specific queue).
+	 */
+	old_policy = this_task->policy;
+	if (old_policy != SCHED_OTHER) {
+		del_from_runqueue(this_task);
+	}
 	this_task->nice = DEF_NICE;
 	this_task->policy = SCHED_OTHER;
+	if (old_policy != SCHED_OTHER) {
+		add_to_runqueue(this_task);
+	}
+
 	/* cpus_allowed? */
 	/* rt_priority? */
 	/* signals? */
@@ -1274,7 +1675,7 @@
 	memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
 	this_task->user = INIT_USER;
 
-	spin_unlock(&runqueue_lock);
+	UNLOCK_TASK_RQ(this_task);
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -1337,6 +1738,14 @@
 	 */
 	int cpu = smp_processor_id();
 	int nr;
+
+	/*
+	 * Initialize the scheduling data structures
+	 */
+	for (nr = 0; nr < N_RUNQUEUES; nr++)
+		INIT_RUNQUEUE_DATA(nr);
+	for (nr = 0; nr < N_ALIGNED_DATA; nr++)
+		INIT_ALIGNED_DATA(nr);
 
 	init_task.processor = cpu;
 
diff -Naur linux-2.4.14/kernel/signal.c linux-2.4.14-mq/kernel/signal.c
--- linux-2.4.14/kernel/signal.c	Mon Sep 17 23:40:01 2001
+++ linux-2.4.14-mq/kernel/signal.c	Tue Nov  6 22:19:27 2001
@@ -478,10 +478,10 @@
 	 * process of changing - but no harm is done by that
 	 * other than doing an extra (lightweight) IPI interrupt.
 	 */
-	spin_lock(&runqueue_lock);
+	LOCK_TASK_RQ_VERIFY(t);
 	if (t->has_cpu && t->processor != smp_processor_id())
 		smp_send_reschedule(t->processor);
-	spin_unlock(&runqueue_lock);
+	UNLOCK_TASK_RQ(t);
 #endif /* CONFIG_SMP */
 
 	if (t->state & TASK_INTERRUPTIBLE) {
diff -Naur linux-2.4.14/kernel/timer.c linux-2.4.14-mq/kernel/timer.c
--- linux-2.4.14/kernel/timer.c	Mon Oct  8 17:41:41 2001
+++ linux-2.4.14-mq/kernel/timer.c	Tue Nov  6 22:19:27 2001
@@ -587,6 +587,11 @@
 			p->counter = 0;
 			p->need_resched = 1;
 		}
+#ifdef CONFIG_SMP
+		if (curr_na_goodness(cpu) != MIN_GOODNESS) {
+			curr_na_goodness(cpu) = na_goodness(p);
+		}
+#endif
 		if (p->nice > 0)
 			kstat.per_cpu_nice[cpu] += user_tick;
 		else