Annotation of sys/kern/vfs_sync.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: vfs_sync.c,v 1.43 2007/06/01 23:47:56 deraadt Exp $ */
2:
3: /*
4: * Portions of this code are:
5: *
6: * Copyright (c) 1989, 1993
7: * The Regents of the University of California. All rights reserved.
8: * (c) UNIX System Laboratories, Inc.
9: * All or some portions of this file are derived from material licensed
10: * to the University of California by American Telephone and Telegraph
11: * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12: * the permission of UNIX System Laboratories, Inc.
13: *
14: * Redistribution and use in source and binary forms, with or without
15: * modification, are permitted provided that the following conditions
16: * are met:
17: * 1. Redistributions of source code must retain the above copyright
18: * notice, this list of conditions and the following disclaimer.
19: * 2. Redistributions in binary form must reproduce the above copyright
20: * notice, this list of conditions and the following disclaimer in the
21: * documentation and/or other materials provided with the distribution.
22: * 3. Neither the name of the University nor the names of its contributors
23: * may be used to endorse or promote products derived from this software
24: * without specific prior written permission.
25: *
26: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36: * SUCH DAMAGE.
37: */
38:
39: /*
40: * Syncer daemon
41: */
42:
43: #include <sys/queue.h>
44: #include <sys/param.h>
45: #include <sys/systm.h>
46: #include <sys/proc.h>
47: #include <sys/mount.h>
48: #include <sys/vnode.h>
49: #include <sys/buf.h>
50: #include <sys/malloc.h>
51:
52: #include <sys/kernel.h>
53: #include <sys/sched.h>
54:
55: #ifdef FFS_SOFTUPDATES
56: int softdep_process_worklist(struct mount *);
57: #endif
58:
59: /*
60: * The workitem queue.
61: */
62: #define SYNCER_MAXDELAY 32 /* maximum sync delay time */
63: #define SYNCER_DEFAULT 30 /* default sync delay time */
64: int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
65: time_t syncdelay = SYNCER_DEFAULT; /* time to delay syncing vnodes */
66:
67: int rushjob = 0; /* number of slots to run ASAP */
68: int stat_rush_requests = 0; /* number of rush requests */
69:
70: static int syncer_delayno = 0;
71: static long syncer_mask;
72: LIST_HEAD(synclist, vnode);
73: static struct synclist *syncer_workitem_pending;
74:
75: struct proc *syncerproc;
76:
77: /*
78: * The workitem queue.
79: *
80: * It is useful to delay writes of file data and filesystem metadata
81: * for tens of seconds so that quickly created and deleted files need
82: * not waste disk bandwidth being created and removed. To realize this,
83: * we append vnodes to a "workitem" queue. When running with a soft
84: * updates implementation, most pending metadata dependencies should
85: * not wait for more than a few seconds. Thus, mounted block devices
86: * are delayed only about half the time that file data is delayed.
87: * Similarly, directory updates are more critical, so are only delayed
88: * about a third the time that file data is delayed. Thus, there are
89: * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
90: * one each second (driven off the filesystem syncer process). The
91: * syncer_delayno variable indicates the next queue that is to be processed.
92: * Items that need to be processed soon are placed in this queue:
93: *
94: * syncer_workitem_pending[syncer_delayno]
95: *
96: * A delay of fifteen seconds is done by placing the request fifteen
97: * entries later in the queue:
98: *
99: * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
100: *
101: */
102:
103: void
104: vn_initialize_syncerd(void)
105: {
106: syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK,
107: &syncer_mask);
108: syncer_maxdelay = syncer_mask + 1;
109: }
110:
111: /*
112: * Add an item to the syncer work queue.
113: */
114: void
115: vn_syncer_add_to_worklist(struct vnode *vp, int delay)
116: {
117: int s, slot;
118:
119: if (delay > syncer_maxdelay - 2)
120: delay = syncer_maxdelay - 2;
121: slot = (syncer_delayno + delay) & syncer_mask;
122:
123: s = splbio();
124: if (vp->v_bioflag & VBIOONSYNCLIST)
125: LIST_REMOVE(vp, v_synclist);
126:
127: vp->v_bioflag |= VBIOONSYNCLIST;
128: LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
129: splx(s);
130: }
131:
132: /*
133: * System filesystem synchronizer daemon.
134: */
135: void
136: sched_sync(struct proc *p)
137: {
138: struct synclist *slp;
139: struct vnode *vp;
140: long starttime;
141: int s;
142:
143: syncerproc = curproc;
144:
145: for (;;) {
146: starttime = time_second;
147:
148: /*
149: * Push files whose dirty time has expired.
150: */
151: s = splbio();
152: slp = &syncer_workitem_pending[syncer_delayno];
153:
154: syncer_delayno += 1;
155: if (syncer_delayno == syncer_maxdelay)
156: syncer_delayno = 0;
157:
158: while ((vp = LIST_FIRST(slp)) != NULL) {
159: if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT, p)) {
160: /*
161: * If we fail to get the lock, we move this
162: * vnode one second ahead in time.
163: * XXX - no good, but the best we can do.
164: */
165: vn_syncer_add_to_worklist(vp, 1);
166: continue;
167: }
168: splx(s);
169: (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
170: vput(vp);
171: s = splbio();
172: if (LIST_FIRST(slp) == vp) {
173: /*
174: * Note: disk vps can remain on the
175: * worklist too with no dirty blocks, but
176: * since sync_fsync() moves it to a different
177: * slot we are safe.
178: */
179: #ifdef DIAGNOSTIC
180: if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
181: vp->v_type != VBLK) {
182: vprint("fsync failed", vp);
183: if (vp->v_mount != NULL)
184: printf("mounted on: %s\n",
185: vp->v_mount->mnt_stat.f_mntonname);
186: panic("sched_sync: fsync failed");
187: }
188: #endif /* DIAGNOSTIC */
189: /*
190: * Put us back on the worklist. The worklist
191: * routine will remove us from our current
192: * position and then add us back in at a later
193: * position.
194: */
195: vn_syncer_add_to_worklist(vp, syncdelay);
196: }
197: }
198:
199: splx(s);
200:
201: #ifdef FFS_SOFTUPDATES
202: /*
203: * Do soft update processing.
204: */
205: softdep_process_worklist(NULL);
206: #endif
207:
208: /*
209: * The variable rushjob allows the kernel to speed up the
210: * processing of the filesystem syncer process. A rushjob
211: * value of N tells the filesystem syncer to process the next
212: * N seconds worth of work on its queue ASAP. Currently rushjob
213: * is used by the soft update code to speed up the filesystem
214: * syncer process when the incore state is getting so far
215: * ahead of the disk that the kernel memory pool is being
216: * threatened with exhaustion.
217: */
218: if (rushjob > 0) {
219: rushjob -= 1;
220: continue;
221: }
222: /*
223: * If it has taken us less than a second to process the
224: * current work, then wait. Otherwise start right over
225: * again. We can still lose time if any single round
226: * takes more than two seconds, but it does not really
227: * matter as we are just trying to generally pace the
228: * filesystem activity.
229: */
230: if (time_second == starttime)
231: tsleep(&lbolt, PPAUSE, "syncer", 0);
232: }
233: }
234:
235: /*
236: * Request the syncer daemon to speed up its work.
237: * We never push it to speed up more than half of its
238: * normal turn time, otherwise it could take over the cpu.
239: */
240: int
241: speedup_syncer(void)
242: {
243: int s;
244:
245: SCHED_LOCK(s);
246: if (syncerproc && syncerproc->p_wchan == &lbolt)
247: setrunnable(syncerproc);
248: SCHED_UNLOCK(s);
249: if (rushjob < syncdelay / 2) {
250: rushjob += 1;
251: stat_rush_requests += 1;
252: return 1;
253: }
254: return 0;
255: }
256:
257: /*
258: * Routine to create and manage a filesystem syncer vnode.
259: */
260: #define sync_close nullop
261: int sync_fsync(void *);
262: int sync_inactive(void *);
263: #define sync_reclaim nullop
264: #define sync_lock vop_generic_lock
265: #define sync_unlock vop_generic_unlock
266: int sync_print(void *);
267: #define sync_islocked vop_generic_islocked
268:
269: int (**sync_vnodeop_p)(void *);
270: struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
271: { &vop_default_desc, vn_default_error },
272: { &vop_close_desc, sync_close }, /* close */
273: { &vop_fsync_desc, sync_fsync }, /* fsync */
274: { &vop_inactive_desc, sync_inactive }, /* inactive */
275: { &vop_reclaim_desc, sync_reclaim }, /* reclaim */
276: { &vop_lock_desc, sync_lock }, /* lock */
277: { &vop_unlock_desc, sync_unlock }, /* unlock */
278: { &vop_print_desc, sync_print }, /* print */
279: { &vop_islocked_desc, sync_islocked }, /* islocked */
280: { (struct vnodeop_desc*)NULL, (int(*)(void *))NULL }
281: };
282: struct vnodeopv_desc sync_vnodeop_opv_desc = {
283: &sync_vnodeop_p, sync_vnodeop_entries
284: };
285:
286: /*
287: * Create a new filesystem syncer vnode for the specified mount point.
288: */
289: int
290: vfs_allocate_syncvnode(struct mount *mp)
291: {
292: struct vnode *vp;
293: static long start, incr, next;
294: int error;
295:
296: /* Allocate a new vnode */
297: if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
298: mp->mnt_syncer = NULL;
299: return (error);
300: }
301: vp->v_writecount = 1;
302: vp->v_type = VNON;
303: /*
304: * Place the vnode onto the syncer worklist. We attempt to
305: * scatter them about on the list so that they will go off
306: * at evenly distributed times even if all the filesystems
307: * are mounted at once.
308: */
309: next += incr;
310: if (next == 0 || next > syncer_maxdelay) {
311: start /= 2;
312: incr /= 2;
313: if (start == 0) {
314: start = syncer_maxdelay / 2;
315: incr = syncer_maxdelay;
316: }
317: next = start;
318: }
319: vn_syncer_add_to_worklist(vp, next);
320: mp->mnt_syncer = vp;
321: return (0);
322: }
323:
324: /*
325: * Do a lazy sync of the filesystem.
326: */
327: int
328: sync_fsync(void *v)
329: {
330: struct vop_fsync_args *ap = v;
331: struct vnode *syncvp = ap->a_vp;
332: struct mount *mp = syncvp->v_mount;
333: int asyncflag;
334:
335: /*
336: * We only need to do something if this is a lazy evaluation.
337: */
338: if (ap->a_waitfor != MNT_LAZY)
339: return (0);
340:
341: /*
342: * Move ourselves to the back of the sync list.
343: */
344: vn_syncer_add_to_worklist(syncvp, syncdelay);
345:
346: /*
347: * Walk the list of vnodes pushing all that are dirty and
348: * not already on the sync list.
349: */
350: if (vfs_busy(mp, VB_READ|VB_NOWAIT) == 0) {
351: asyncflag = mp->mnt_flag & MNT_ASYNC;
352: mp->mnt_flag &= ~MNT_ASYNC;
353: VFS_SYNC(mp, MNT_LAZY, ap->a_cred, ap->a_p);
354: if (asyncflag)
355: mp->mnt_flag |= MNT_ASYNC;
356: vfs_unbusy(mp);
357: }
358:
359: return (0);
360: }
361:
362: /*
363: * The syncer vnode is no longer needed and is being decommissioned.
364: */
365: int
366: sync_inactive(void *v)
367: {
368: struct vop_inactive_args *ap = v;
369:
370: struct vnode *vp = ap->a_vp;
371: int s;
372:
373: if (vp->v_usecount == 0) {
374: VOP_UNLOCK(vp, 0, ap->a_p);
375: return (0);
376: }
377:
378: vp->v_mount->mnt_syncer = NULL;
379:
380: s = splbio();
381:
382: LIST_REMOVE(vp, v_synclist);
383: vp->v_bioflag &= ~VBIOONSYNCLIST;
384:
385: splx(s);
386:
387: vp->v_writecount = 0;
388: vput(vp);
389:
390: return (0);
391: }
392:
/*
 * print operation of a syncer vnode.
 *
 * The argument is not examined; a fixed label is printed and 0 returned.
 */
int
sync_print(void *v)
{
	const int rv = 0;

	printf("syncer vnode\n");
	return (rv);
}
CVSweb