#ifdef CONFIG_SCHEDSTATS

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
	if (rq) {
		rq->rq_sched_info.run_delay += delta;
		rq->rq_sched_info.pcount++;
	}
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_cpu_time += delta;
}

static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_sched_info.run_delay += delta;
}

#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var)		(var)
#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)

#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
#define schedstat_enabled()		0
#define schedstat_inc(var)		do { } while (0)
#define schedstat_add(var, amt)		do { } while (0)
#define schedstat_set(var, val)		do { } while (0)
#define schedstat_val(var)		0
#define schedstat_val_or_zero(var)	0
#endif /* CONFIG_SCHEDSTATS */
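
/*
 * Illustrative usage (a sketch, not code from this file; the call sites
 * and field names below live elsewhere in the scheduler and are given
 * only for orientation): callers wrap statistics updates in these macros
 * so the stores compile away when CONFIG_SCHEDSTATS is off and are
 * skipped behind the static branch when schedstats are runtime-disabled:
 *
 *	schedstat_inc(rq->yld_count);
 *	schedstat_set(p->se.statistics.wait_start, rq_clock(rq));
 */
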
#ifdef CONFIG_PSI
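/*
 * Overview (condensed from the patch description that introduced PSI):
 * psi tracks the task states associated with each CPU and samples the
 * time spent in stall states. Every 2 seconds the samples are averaged
 * across CPUs, weighted by each CPU's non-idle time, and translated into
 * percentages of walltime; running averages of those percentages are
 * maintained over 10s, 1m and 5m windows (similar to the loadavg) and
 * exposed under /proc/pressure/ for cpu, memory and io.
 */
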
/*
 * PSI tracks state that persists across sleeps, such as iowaits and
 * memory stalls. As a result, it has to distinguish between sleeps,
 * where a task's runnable state changes, and requeues, where a task
 * and its state are being moved between CPUs and runqueues.
 */
static inline void psi_enqueue(struct task_struct *p, bool wakeup)
{
	int clear = 0, set = TSK_RUNNING;

	if (static_branch_likely(&psi_disabled))
		return;

	if (!wakeup || p->sched_psi_wake_requeue) {
		if (p->in_memstall)
			set |= TSK_MEMSTALL;
		if (p->sched_psi_wake_requeue)
			p->sched_psi_wake_requeue = 0;
	} else {
		if (p->in_iowait)
			clear |= TSK_IOWAIT;
	}

	psi_task_change(p, clear, set);
}

static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
	int clear = TSK_RUNNING, set = 0;

	if (static_branch_likely(&psi_disabled))
		return;

	if (!sleep) {
		if (p->in_memstall)
			clear |= TSK_MEMSTALL;
	} else {
		if (p->in_iowait)
			set |= TSK_IOWAIT;
	}

	psi_task_change(p, clear, set);
}
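
/*
 * Example flow (illustrative): a task that blocks on IO is dequeued with
 * sleep == true, so psi_dequeue() clears TSK_RUNNING and, because
 * p->in_iowait is set, records TSK_IOWAIT. When the wakeup later
 * enqueues it, psi_enqueue() with wakeup == true clears TSK_IOWAIT and
 * sets TSK_RUNNING again, so the interval in between contributes to IO
 * pressure rather than CPU pressure.
 */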

static inline void psi_ttwu_dequeue(struct task_struct *p)
{
	if (static_branch_likely(&psi_disabled))
		return;
	/*
	 * Is the task being migrated during a wakeup? Make sure to
	 * deregister its sleep-persistent psi states from the old
	 * queue, and let psi_enqueue() know it has to requeue.
	 */
	if (unlikely(p->in_iowait || p->in_memstall)) {
		struct rq_flags rf;
		struct rq *rq;
		int clear = 0;

		if (p->in_iowait)
			clear |= TSK_IOWAIT;
		if (p->in_memstall)
			clear |= TSK_MEMSTALL;

		rq = __task_rq_lock(p, &rf);
		psi_task_change(p, clear, 0);
		p->sched_psi_wake_requeue = 1;
		__task_rq_unlock(rq, &rf);
	}
}

static inline void psi_task_tick(struct rq *rq)
{
	if (static_branch_likely(&psi_disabled))
		return;

	if (unlikely(rq->curr->in_memstall))
		psi_memstall_tick(rq->curr, cpu_of(rq));
}

#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */

#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
	t->sched_info.last_queued = 0;
}

/*
 * We are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu; we call this
 * routine from dequeue_task() to account for possible rq->clock skew
 * across cpus. The delta taken on each cpu would annul the skew.
 */
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
	unsigned long long now = rq_clock(rq), delta = 0;

	if (unlikely(sched_info_on()))
		if (t->sched_info.last_queued)
			delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;

	rq_sched_info_dequeued(rq, delta);
}
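
/*
 * Note (from the delay-accounting fix that introduced the dequeue hook):
 * taking the delta at every dequeue makes the accounting symmetric with
 * enqueue, so when a task migrates, the wait time measured on the old rq
 * and the wait time measured on the new rq are both added to run_delay,
 * and the wait measurement restarts from zero on the new runqueue
 * instead of mixing clock readings from different CPUs.
 */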

/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
	unsigned long long now = rq_clock(rq), delta = 0;

	if (t->sched_info.last_queued)
		delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;

	rq_sched_info_arrive(rq, delta);
}

/*
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is not already set. It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = rq_clock(rq);
}

/*
 * Called when a process ceases being the active-running process involuntarily
 * due, typically, to expiring its time slice (this may also be called when
 * switching to the idle task). Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
	unsigned long long delta = rq_clock(rq) -
					t->sched_info.last_arrival;

	rq_sched_info_depart(rq, delta);

	if (t->state == TASK_RUNNING)
		sched_info_queued(rq, t);
}

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct rq *rq,
		    struct task_struct *prev, struct task_struct *next)
{
	/*
	 * prev now departs the cpu. It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)
		sched_info_depart(rq, prev);

	if (next != rq->idle)
		sched_info_arrive(rq, next);
}

static inline void
sched_info_switch(struct rq *rq,
		  struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(rq, prev, next);
}
#else
#define sched_info_queued(rq, t)		do { } while (0)
#define sched_info_reset_dequeued(t)		do { } while (0)
#define sched_info_dequeued(rq, t)		do { } while (0)
#define sched_info_depart(rq, t)		do { } while (0)
#define sched_info_arrive(rq, next)		do { } while (0)
#define sched_info_switch(rq, t, next)		do { } while (0)
#endif /* CONFIG_SCHED_INFO */

/*
 * The following are functions that support scheduler-internal time accounting.
 * These functions are generally called at the timer tick. None of this depends
 * on CONFIG_SCHEDSTATS.
 */

/**
 * cputimer_running - return true if cputimer is running
 *
 * @tsk:	Pointer to target task.
 */
static inline bool cputimer_running(struct task_struct *tsk)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	/* Check if cputimer isn't running. This is accessed without locking. */
	if (!READ_ONCE(cputimer->running))
		return false;

	/*
	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
	 * in __exit_signal(), we won't account to the signal struct further
	 * cputime consumed by that task, even though the task can still be
	 * ticking after __exit_signal().
	 *
	 * In order to keep a consistent behaviour between thread group cputime
	 * and thread group cputimer accounting, let's also ignore the cputime
	 * elapsing after __exit_signal() in any thread group timer running.
	 *
	 * This makes sure that POSIX CPU clocks and timers are synchronized, so
	 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
	 * clock delta is behind the expiring timer value.
	 */
	if (unlikely(!tsk->sighand))
		return false;

	return true;
}
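
/*
 * The account_group_*() helpers below fold a task's tick-time and runtime
 * deltas into the shared thread_group_cputimer totals, which back
 * process-wide POSIX CPU timers. They are typically reached from the
 * cputime accounting code (e.g. account_user_time()/account_system_time())
 * and from the scheduler's update_curr() paths; those call sites live
 * outside this file and are named here only for orientation.
 */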

/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the utime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the utime field there.
 */
static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (!cputimer_running(tsk))
		return;

	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}

/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the stime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the stime field there.
 */
static inline void account_group_system_time(struct task_struct *tsk,
					     cputime_t cputime)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (!cputimer_running(tsk))
		return;

	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}

/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @ns:		Time value by which to increment the sum_exec_runtime field
 *		of the thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the sum_exec_runtime field there.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
					      unsigned long long ns)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (!cputimer_running(tsk))
		return;

	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}