Latest version of the scary patches... --
Since we now have p->on_cpu unconditionally available, use it to
re-implement mutex_spin_on_owner.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/mutex.h | 2 -
include/linux/sched.h | 2 -
kernel/mutex-debug.c | 2 -
kernel/mutex-debug.h | 2 -
kernel/mutex.c | 2 -
kernel/mutex.h | 2 -
kernel/sched.c | 83 +++++++++++++++++++-------------------------------
7 files changed, 39 insertions(+), 56 deletions(-)
Index: linux-2.6/include/linux/mutex.h
===================================================================
--- linux-2.6.orig/include/linux/mutex.h
+++ linux-2.6/include/linux/mutex.h
@@ -51,7 +51,7 @@ struct mutex {
spinlock_t wait_lock;
struct list_head wait_list;
#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
- struct thread_info *owner;
+ struct task_struct *owner;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
Index: linux-2.6/kernel/mutex-debug.c
===================================================================
--- linux-2.6.orig/kernel/mutex-debug.c
+++ linux-2.6/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lo
return;
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
}
Index: linux-2.6/kernel/mutex-debug.h
===================================================================
--- linux-2.6.orig/kernel/mutex-debug.h
+++ linux-2.6/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mute
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
Index: linux-2.6/kernel/mutex.c
===================================================================
--- ...In order to call ttwu_stat() without holding rq->lock we must remove
its rq argument. Since we need to change rq stats, account to the
local rq instead of the task rq; this is safe since we have IRQs
disabled.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2367,10 +2367,11 @@ static void update_avg(u64 *avg, u64 sam
#endif
static void
-ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
int this_cpu = smp_processor_id();
+ struct rq *rq = this_rq();
schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
@@ -2491,9 +2492,10 @@ try_to_wake_up(struct task_struct *p, un
activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
out_running:
ttwu_post_activation(p, rq, wake_flags);
- ttwu_stat(rq, p, cpu, wake_flags);
success = 1;
__task_rq_unlock(rq);
+
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
put_cpu();
@@ -2527,7 +2529,7 @@ static void try_to_wake_up_local(struct
activate_task(rq, p, ENQUEUE_WAKEUP);
ttwu_post_activation(p, rq, 0);
- ttwu_stat(rq, p, smp_processor_id(), 0);
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
--
Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take rq->lock and doing the task enqueue remotely, saving lots on cacheline transfers. As measured using: http://oss.oracle.com/~mason/sembench.c $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 639476 worker burns per second patched: run time 30 seconds 847526 worker burns per second Still need to sort out all the races marked XXX (non-trivial), and its x86 only for the moment. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> --- arch/x86/kernel/smp.c | 1 include/linux/sched.h | 2 kernel/sched.c | 139 +++++++++++++++++++++++++++++++++++++----------- kernel/sched_features.h | 2 4 files changed, 113 insertions(+), 31 deletions(-) Index: linux-2.6/arch/x86/kernel/smp.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/smp.c +++ linux-2.6/arch/x86/kernel/smp.c @@ -205,6 +205,7 @@ void smp_reschedule_interrupt(struct pt_ /* * KVM uses this interrupt to force a cpu out of guest mode */ + sched_ttwu_pending(); } void smp_call_function_interrupt(struct pt_regs *regs) Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -1020,6 +1020,7 @@ partition_sched_domains(int ndoms_new, c } #endif /* !CONFIG_SMP */ +void sched_ttwu_pending(void); struct io_context; /* See blkdev.h */ @@ -1201,6 +1202,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP + struct task_struct *wake_entry; int on_cpu; #endif int on_rq; Index: ...
Provide a generic p->on_rq because the p->se.on_rq semantics are
unfavourable for lockless wakeups but needed for sched_fair.
In particular, p->on_rq is only cleared when we actually dequeue the
task in schedule() and not on any random dequeue as done by things
like __migrate_task() and __sched_setscheduler().
This also allows us to remove p->se usage from !sched_fair code.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 1 +
kernel/sched.c | 36 ++++++++++++++++++------------------
kernel/sched_debug.c | 2 +-
kernel/sched_rt.c | 10 +++++-----
kernel/sched_stoptask.c | 2 +-
5 files changed, 26 insertions(+), 25 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1200,6 +1200,7 @@ struct task_struct {
#ifdef CONFIG_SMP
int on_cpu;
#endif
+ int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1759,7 +1759,6 @@ static void enqueue_task(struct rq *rq,
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1767,7 +1766,6 @@ static void dequeue_task(struct rq *rq,
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
@@ -1780,6 +1778,7 @@ static void activate_task(struct rq *rq,
enqueue_task(rq, p, flags);
inc_nr_running(rq);
+ p->on_rq = 1;
}
/*
@@ -2070,7 +2069,7 @@ static void check_preempt_curr(struct rq
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back ...On hot-unplug flush the pending wakeup queue by selecting a new rq for
each pending task and requeueing it appropriately.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2526,6 +2526,29 @@ static void ttwu_queue(struct task_struc
raw_spin_unlock(&rq->lock);
}
+#ifdef CONFIG_HOTPLUG_CPU
+static void ttwu_queue_unplug(struct rq *rq)
+{
+ struct task_struct *p, *list = xchg(&rq->wake_list, NULL);
+ unsigned long flags;
+ int cpu;
+
+ if (!list)
+ return;
+
+ while (list) {
+ p = list;
+ list = list->wake_entry;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, 0);
+ set_task_cpu(p, cpu);
+ ttwu_queue(p, cpu);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ }
+}
+#endif
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -6151,6 +6174,11 @@ migration_call(struct notifier_block *nf
migrate_nr_uninterruptible(rq);
calc_global_load_remove(rq);
break;
+
+ case CPU_DEAD:
+ ttwu_queue_unplug(cpu_rq(cpu));
+ break;
+
#endif
}
return NOTIFY_OK;
--
ttwu_post_activation() does the core wakeup: it sets TASK_RUNNING
and performs wakeup-preemption, so give it a more descriptive name.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2399,8 +2399,11 @@ ttwu_stat(struct task_struct *p, int cpu
#endif /* CONFIG_SCHEDSTATS */
}
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
static void
-ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
@@ -2492,7 +2495,7 @@ try_to_wake_up(struct task_struct *p, un
activate_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
out_running:
- ttwu_post_activation(p, rq, wake_flags);
+ ttwu_do_wakeup(rq, p, wake_flags);
success = 1;
__task_rq_unlock(rq);
@@ -2529,7 +2532,7 @@ static void try_to_wake_up_local(struct
if (!p->on_rq)
activate_task(rq, p, ENQUEUE_WAKEUP);
- ttwu_post_activation(p, rq, 0);
+ ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
--
Since we now serialize ttwu() using p->pi_lock, we also need to
serialize ttwu_local() using that, otherwise, once we drop the
rq->lock from ttwu() it can race with ttwu_local().
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2513,9 +2513,9 @@ static int try_to_wake_up(struct task_st
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not alredy there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
@@ -2523,16 +2523,21 @@ static void try_to_wake_up_local(struct
BUG_ON(rq != this_rq());
BUG_ON(p == current);
- lockdep_assert_held(&rq->lock);
+
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
if (!p->on_rq)
activate_task(rq, p, ENQUEUE_WAKEUP);
ttwu_post_activation(p, rq, 0);
ttwu_stat(rq, p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
@@ -3925,6 +3930,7 @@ pick_next_task(struct rq *rq)
*/
asmlinkage void __sched schedule(void)
{
+ struct task_struct *to_wakeup = NULL;
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
@@ -3958,21 +3964,21 @@ asmlinkage void __sched schedule(void)
* task to maintain concurrency. If so, wake
* up the task.
*/
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
+ if ...In preparation of calling select_task_rq() without rq->lock held, drop
the dependency on the rq argument.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 3 +--
kernel/sched.c | 20 +++++++++++---------
kernel/sched_fair.c | 2 +-
kernel/sched_idletask.c | 2 +-
kernel/sched_rt.c | 10 +++++++++-
kernel/sched_stoptask.c | 3 +--
6 files changed, 24 insertions(+), 16 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1063,8 +1063,7 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct rq *rq, struct task_struct *p,
- int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2138,13 +2138,15 @@ static int migration_cpu_stop(void *data
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, struct rq *rq)
+static bool need_migrate_task(struct task_struct *p)
{
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
*/
- return p->on_rq || task_running(rq, p);
+ bool running = p->on_rq || p->on_cpu;
+ smp_rmb(); /* finish_lock_switch() */
+ return running;
}
/*
@@ -2337,9 +2339,9 @@ static int select_fallback_rq(int cpu, s
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct rq *rq, struct ...In preparation of calling this without rq->lock held, remove the
dependency on the rq argument.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 10 +++++++---
kernel/sched.c | 2 +-
kernel/sched_fair.c | 4 +++-
3 files changed, 11 insertions(+), 5 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1045,8 +1045,12 @@ struct sched_domain;
#define WF_FORK 0x02 /* child wakeup after fork */
#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_WAKING 2
-#define ENQUEUE_HEAD 4
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
#define DEQUEUE_SLEEP 1
@@ -1067,7 +1071,7 @@ struct sched_class {
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
- void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+ void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2481,7 +2481,7 @@ static int try_to_wake_up(struct task_st
p->state = TASK_WAKING;
if (p->sched_class->task_waking) {
- p->sched_class->task_waking(rq, p);
+ p->sched_class->task_waking(p);
en_flags |= ENQUEUE_WAKING;
}
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1338,11 +1338,13 @@ static void yield_task_fair(struct rq *r
#ifdef CONFIG_SMP
-static void task_waking_fair(struct rq *rq, ...Currently ttwu() does two rq->lock acquisitions, once on the task's
old rq, holding it over the p->state fiddling and load-balance pass.
Then it drops the old rq->lock to acquire the new rq->lock.
By having serialized ttwu(), p->sched_class, p->cpus_allowed with
p->pi_lock, we can now drop the whole first rq->lock acquisition.
The p->pi_lock serializing concurrent ttwu() calls protects p->state,
which we will set to TASK_WAKING to bridge possible p->pi_lock to
rq->lock gaps and serialize set_task_cpu() calls against
task_rq_lock().
The p->pi_lock serialization of p->sched_class allows us to call
scheduling class methods without holding the rq->lock, and the
serialization of p->cpus_allowed allows us to do the load-balancing
bits without races.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 47 +++++++++++++++++++----------------------------
kernel/sched_fair.c | 3 +--
2 files changed, 20 insertions(+), 30 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2436,69 +2436,60 @@ ttwu_post_activation(struct task_struct
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, orig_cpu, this_cpu, success = 0;
+ int cpu, this_cpu, success = 0;
unsigned long flags;
- unsigned long en_flags = ENQUEUE_WAKEUP;
struct rq *rq;
this_cpu = get_cpu();
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- rq = __task_rq_lock(p);
if (!(p->state & state))
goto out;
cpu = task_cpu(p);
- if (p->on_rq)
- goto out_running;
+ if (p->on_rq) {
+ rq = __task_rq_lock(p);
+ if (p->on_rq)
+ goto ...try_to_wake_up() would only return a success when it would have to
place a task on a rq, change that to every time we change p->state to
TASK_RUNNING, because that's the real measure of wakeups.
As a result, success is always true for the tracepoints.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2383,10 +2383,10 @@ static inline void ttwu_activate(struct
activate_task(rq, p, en_flags);
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- int wake_flags, bool success)
+static void
+ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2406,7 +2406,7 @@ static inline void ttwu_post_activation(
}
#endif
/* if a worker is waking up, notify workqueue */
- if ((p->flags & PF_WQ_WORKER) && success)
+ if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}
@@ -2505,9 +2505,9 @@ static int try_to_wake_up(struct task_st
#endif /* CONFIG_SMP */
ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
cpu == this_cpu, en_flags);
- success = 1;
out_running:
- ttwu_post_activation(p, rq, wake_flags, success);
+ ttwu_post_activation(p, rq, wake_flags);
+ success = 1;
out:
task_rq_unlock(rq, &flags);
put_cpu();
@@ -2526,7 +2526,6 @@ static int try_to_wake_up(struct task_st
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
BUG_ON(rq != this_rq());
BUG_ON(p == current);
@@ -2541,9 +2540,8 @@ static void try_to_wake_up_local(struct
schedstat_inc(rq, ttwu_local);
}
...Collect all ttwu stat code into a single function and ensure it's
always called for an actual wakeup (changing p->state to
TASK_RUNNING).
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 67 +++++++++++++++++++++++++++------------------------------
1 file changed, 32 insertions(+), 35 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2366,21 +2366,36 @@ static void update_avg(u64 *avg, u64 sam
}
#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync, bool is_migrate, bool is_local,
- unsigned long en_flags)
+static void
+ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags)
{
+#ifdef CONFIG_SCHEDSTATS
+ int this_cpu = smp_processor_id();
+
+ schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
+
+ if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
+
+ if (cpu != task_cpu(p))
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ } else {
+ struct sched_domain *sd;
- activate_task(rq, p, en_flags);
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif /* CONFIG_SCHEDSTATS */
}
static void
@@ -2440,12 +2455,12 @@ static int try_to_wake_up(struct task_st
if (!(p->state & state))
goto out;
+ cpu = task_cpu(p);
+
if (p->se.on_rq)
goto out_running;
- cpu = task_cpu(p);
orig_cpu = cpu;
-
#ifdef CONFIG_SMP
if (unlikely(task_running(rq, p)))
goto ...Currently p->pi_lock already serializes p->sched_class, also put
p->cpus_allowed and try_to_wake_up() under it; this prepares the way
to do the first part of ttwu() without holding rq->lock.
By having p->sched_class and p->cpus_allowed serialized by p->pi_lock,
we prepare the way to call select_task_rq() without holding rq->lock.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 37 ++++++++++++++++---------------------
1 file changed, 16 insertions(+), 21 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -2301,7 +2301,7 @@ void task_oncpu_function_call(struct tas
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -2334,7 +2334,7 @@ static int select_fallback_rq(int cpu, s
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
@@ -2450,7 +2450,8 @@ static int try_to_wake_up(struct task_st
this_cpu = get_cpu();
smp_wmb();
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
if (!(p->state & state))
goto out;
@@ -2508,7 +2509,8 @@ static int try_to_wake_up(struct task_st
ttwu_stat(rq, p, cpu, wake_flags);
success = 1;
out:
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
put_cpu();
return success;
@@ -4543,6 +4545,8 @@ void rt_mutex_setprio(struct task_struct
BUG_ON(prio < 0 || prio > MAX_PRIO);
+ lockdep_assert_held(&p->pi_lock);
+
rq = task_rq_lock(p, &flags);
...Always provide p->on_cpu so that we can determine whether a task is
on a cpu without having to lock the rq.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 4 +---
kernel/sched.c | 46 +++++++++++++++++++++++++++++-----------------
2 files changed, 30 insertions(+), 20 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -845,18 +845,39 @@ static inline int task_current(struct rq
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -872,15 +893,6 @@ static inline void finish_lock_switch(st
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -889,7 +901,7 @@ static inline void prepare_lock_switch(s
...Just to make it easier for others to see the rationale behind this series, see the numbers and limitations listed below. Ingo -------------------------> As measured using: http://oss.oracle.com/~mason/sembench.c $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 639476 worker burns per second patched: run time 30 seconds 847526 worker burns per second Still need to sort out all the races marked XXX (non-trivial), and it's x86-only for the moment. --
