/*	$OpenBSD: sched_bsd.c,v 1.12 2007/05/18 16:10:15 art Exp $	*/
/*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks_init;		/* # of hardclock ticks per roundrobin() */

int	whichqs;		/* Bit mask summary of non-empty Q's. */
struct	prochd qs[NQS];

struct SIMPLELOCK sched_lock;

void scheduler_start(void);

void roundrobin(struct cpu_info *);
void schedcpu(void *);
void updatepri(struct proc *);
void endtsleep(void *);

void
scheduler_start(void)
{
	static struct timeout schedcpu_to;

	/*
	 * We avoid polluting the global namespace by keeping the
	 * scheduler timeout static in this function.
	 * We set up the timeout here and kick schedcpu once to make
	 * it do its job; roundrobin() needs no timeout of its own,
	 * since hardclock() drives it every rrticks_init ticks.
	 */

	timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);

	rrticks_init = hz / 10;
	schedcpu(&schedcpu_to);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int s;

	spc->spc_rrticks = rrticks_init;

	if (curproc != NULL) {
		s = splstatclock();
		if (spc->spc_schedflags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_schedflags |= SPCF_SHOULDYIELD;
		} else {
			spc->spc_schedflags |= SPCF_SEENRR;
		}
		splx(s);
	}

	need_resched(curcpu());
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).  QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
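
/*
 * Illustrative sketch, not part of the original file: a standalone
 * userland check of the derivation above.  It verifies that
 * decay = b/(b+1) with b = 2*loadav gives decay**(5*loadav) near .1,
 * and that the fixed-point decay_cpu() grinds a maximal p_estcpu of
 * 255 down to zero in about a dozen once-a-second steps at loadav 1
 * (see updatepri() below).  The x-prefixed names and FSHIFT == 11
 * are assumptions for the sketch, not kernel interfaces.
 */
#if 0
#include <stdio.h>
#include <math.h>

#define XFSHIFT	11				/* assumed FSHIFT */
#define XFSCALE	(1 << XFSHIFT)			/* assumed FSCALE */
#define xloadfactor(loadav)	(2 * (loadav))
#define xdecay_cpu(loadfac, cpu) \
	(((loadfac) * (cpu)) / ((loadfac) + XFSCALE))

int
main(void)
{
	double loadav, decay;
	long loadfac, estcpu;
	int steps;

	for (loadav = 1; loadav <= 4; loadav++) {
		decay = (2 * loadav) / (2 * loadav + 1);
		printf("loadav %.0f: decay**(5*loadav) = %.3f\n",
		    loadav, pow(decay, 5 * loadav));	/* ~.095 to ~.132 */
	}

	/* fixed-point decay of estcpu = 255 at a steady loadav of 1.0 */
	loadfac = xloadfactor(1 * XFSCALE);		/* 2.0 in fixpt */
	estcpu = 255;
	for (steps = 0; estcpu != 0; steps++)
		estcpu = xdecay_cpu(loadfac, estcpu);	/* *= 2/3 */
	printf("estcpu 255 -> 0 in %d once-a-second steps\n", steps);
	return (0);
}
#endif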

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
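
/*
 * Illustrative sketch, not part of the original file: the p_pctcpu
 * decay in userland form, checking the "95% in 60 seconds" claim
 * above.  A process that stops running keeps only about 5% of its
 * old p_pctcpu after 60 once-a-second multiplications by ccpu.
 * The x-prefixed names and FSHIFT == 11 are assumptions.
 */
#if 0
#include <stdio.h>

#define XFSHIFT	11				/* assumed FSHIFT */
#define XFSCALE	(1 << XFSHIFT)			/* assumed FSCALE */

int
main(void)
{
	unsigned int ccpu = 0.95122942450071400909 * XFSCALE;
	unsigned int pctcpu = XFSCALE;		/* start at 100% cpu */
	int sec;

	for (sec = 0; sec < 60; sec++)		/* asleep: decay only */
		pctcpu = (pctcpu * ccpu) >> XFSHIFT;
	printf("after 60s asleep: %.1f%% of the estimate remains\n",
	    100.0 * pctcpu / XFSCALE);		/* ~5%: 95% forgotten */
	return (0);
}
#endif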

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	struct timeout *to = (struct timeout *)arg;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int s;
	unsigned int newcpu;
	int phz;

	/*
	 * If we have a statistics clock, use that to calculate CPU
	 * time, otherwise revert to using the profiling clock (which,
	 * in turn, defaults to hz if there is no separate profiling
	 * clock available)
	 */
	phz = stathz ? stathz : profhz;
	KASSERT(phz);

	for (p = LIST_FIRST(&allproc); p != NULL; p = LIST_NEXT(p, p_list)) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		SCHED_LOCK(s);
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (phz == 100)?
		    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
		    100 * (((fixpt_t) p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / phz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / phz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
			    p->p_stat == SRUN &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		SCHED_UNLOCK(s);
	}
	uvm_meter();
	wakeup(&lbolt);
	timeout_add(to, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct proc *p)
{
	unsigned int newcpu = p->p_estcpu;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	SCHED_ASSERT_LOCKED();

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{
	SIMPLE_UNLOCK(&sched_lock);
}

void
sched_lock_idle(void)
{
	SIMPLE_LOCK(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */
void
preempt(struct proc *newp)
{
	struct proc *p = curproc;
	int s;

	/*
	 * XXX Switching to a specific process is not supported yet.
	 */
	if (newp != NULL)
		panic("preempt: cpu_preempt not yet implemented");

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}


/*
 * Must be called at splstatclock() or higher.
 */
void
mi_switch(void)
{
	struct proc *p = curproc;	/* XXX */
	struct rlimit *rlim;
	struct timeval tv;
#if defined(MULTIPROCESSOR)
	int hold_count;
	int sched_count;
#endif
	struct schedstate_percpu *spc = &p->p_cpu->ci_schedstate;

	SCHED_ASSERT_LOCKED();

#if defined(MULTIPROCESSOR)
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	sched_count = __mp_release_all_but_one(&sched_lock);
	if (p->p_flag & P_BIGLOCK)
		hold_count = __mp_release_all(&kernel_lock);
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 * XXX - use microuptime here to avoid strangeness.
	 */
	microuptime(&tv);
	if (timercmp(&tv, &spc->spc_runtime, <)) {
#if 0
		printf("uptime is not monotonic! "
		    "tv=%lu.%06lu, runtime=%lu.%06lu\n",
		    tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec,
		    spc->spc_runtime.tv_usec);
#endif
	} else {
		timersub(&tv, &spc->spc_runtime, &tv);
		timeradd(&p->p_rtime, &tv, &p->p_rtime);
	}

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if ((rlim_t)p->p_rtime.tv_sec >= rlim->rlim_cur) {
		if ((rlim_t)p->p_rtime.tv_sec >= rlim->rlim_max) {
			psignal(p, SIGKILL);
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_schedflags &= ~SPCF_SWITCHCLEAR;

	/*
	 * Pick a new current process and record its start time.
	 */
	uvmexp.swtch++;
	cpu_switch(p);

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(p->p_cpu != NULL);
	KDASSERT(p->p_cpu == curcpu());
	microuptime(&p->p_cpu->ci_schedstate.spc_runtime);

#if defined(MULTIPROCESSOR)
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock and the scheduler lock.
	 */
	if (p->p_flag & P_BIGLOCK)
		__mp_acquire_count(&kernel_lock, hold_count);
	__mp_acquire_count(&sched_lock, sched_count + 1);
#endif
}
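
/*
 * Illustrative sketch, not part of the original file: the timeval
 * bookkeeping pattern mi_switch() uses, in userland form.  timercmp(),
 * timersub() and timeradd() are the <sys/time.h> macros used above;
 * gettimeofday() stands in for microuptime().
 */
#if 0
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct timeval start, now, slice, total = { 0, 0 };

	gettimeofday(&start, NULL);
	usleep(250000);				/* the process "runs" */
	gettimeofday(&now, NULL);
	if (!timercmp(&now, &start, <)) {	/* monotonicity guard */
		timersub(&now, &start, &slice);
		timeradd(&total, &slice, &total);
	}
	printf("accumulated %lld.%06ld seconds\n",
	    (long long)total.tv_sec, (long)total.tv_usec);
	return (0);
}
#endif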

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit(void)
{
	int i;

	for (i = 0; i < NQS; i++)
		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
	SIMPLE_LOCK_INIT(&sched_lock);
}
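
/*
 * Illustrative sketch, not part of the original file: how whichqs
 * summarizes qs[].  Bit i of whichqs is set exactly when queue i is
 * non-empty; with the usual NQS == 32 and PPQ == 4, a priority maps
 * to queue (priority / PPQ), and the dispatcher can find the best
 * (lowest-numbered) non-empty queue in one step with ffs(3) instead
 * of scanning all 32 heads.  The x-prefixed names are stand-ins.
 */
#if 0
#include <stdio.h>
#include <strings.h>

#define XNQS	32
#define XPPQ	4

int xwhichqs;

static void
xsetrunqueue(int pri)
{
	xwhichqs |= 1 << (pri / XPPQ);	/* queue is now non-empty */
}

int
main(void)
{
	xsetrunqueue(127);	/* an idle-priority process */
	xsetrunqueue(50);	/* PUSER: a fresh user process */
	/* ffs() returns 1 + the index of the least significant set bit */
	printf("best queue: %d\n", ffs(xwhichqs) - 1);	/* 12 */
	return (0);
}
#endif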

static __inline void
resched_proc(struct proc *p, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since p->p_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (p->p_cpu != NULL) ? p->p_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * and place it on the run queue.
 */
void
setrunnable(struct proc *p)
{
	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SZOMB:
	case SDEAD:
	default:
		panic("setrunnable");
	case SSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0)
			atomic_setbits_int(&p->p_siglist, sigmask(p->p_xstat));
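		/* FALLTHROUGH */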
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;
	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	setrunqueue(p);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	resched_proc(p, p->p_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu + NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	p->p_usrpri = newpriority;
	resched_proc(p, p->p_usrpri);
}
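
/*
 * Illustrative sketch, not part of the original file: the formula
 * above worked by hand, assuming the usual OpenBSD constants
 * PUSER == 50, NZERO == 20, NICE_WEIGHT == 2 and MAXPRI == 127.
 * Note p_nice is stored biased by NZERO, so "nice +10" is p_nice 30.
 * The x-prefixed helper is hypothetical.
 */
#if 0
#include <stdio.h>

#define XPUSER		50
#define XNZERO		20
#define XNICE_WEIGHT	2
#define XMAXPRI		127

static int
xuserpri(int estcpu, int p_nice)
{
	int pri = XPUSER + estcpu + XNICE_WEIGHT * (p_nice - XNZERO);

	return (pri < XMAXPRI ? pri : XMAXPRI);
}

int
main(void)
{
	printf("fresh, nice 0:   %d\n", xuserpri(0, XNZERO));	    /* 50 */
	printf("busy, nice +10:  %d\n", xuserpri(36, XNZERO + 10)); /* 106 */
	printf("cpu hog, nice 0: %d\n", xuserpri(200, XNZERO));     /* 127 */
	return (0);
}
#endif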

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in resetpriority()
 * above) will compute a different value each time p_estcpu increases.  This
 * can cause a switch, but unless the priority crosses a PPQ boundary the
 * actual queue will not change.  The cpu usage estimator ramps up quite
 * quickly when the process is running (linearly), and decays away
 * exponentially, at a rate which is proportionally slower when the system is
 * busy.  The basic principle is that the system will 90% forget that the
 * process used a lot of CPU time in 5 * loadav seconds.  This causes the
 * system to favor processes which haven't run much recently, and to
 * round-robin among other processes.
 */

void
schedclock(struct proc *p)
{
	int s;

	SCHED_LOCK(s);
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	resetpriority(p);
	if (p->p_priority >= PUSER)
		p->p_priority = p->p_usrpri;
	SCHED_UNLOCK(s);
}