Annotation of sys/ufs/ffs/ffs_softdep.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: ffs_softdep.c,v 1.92 2007/07/11 15:32:22 millert Exp $ */
2:
3: /*
4: * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
5: *
6: * The soft updates code is derived from the appendix of a University
7: * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8: * "Soft Updates: A Solution to the Metadata Update Problem in File
9: * Systems", CSE-TR-254-95, August 1995).
10: *
11: * Further information about soft updates can be obtained from:
12: *
13: * Marshall Kirk McKusick http://www.mckusick.com/softdep/
14: * 1614 Oxford Street mckusick@mckusick.com
15: * Berkeley, CA 94709-1608 +1-510-843-9542
16: * USA
17: *
18: * Redistribution and use in source and binary forms, with or without
19: * modification, are permitted provided that the following conditions
20: * are met:
21: *
22: * 1. Redistributions of source code must retain the above copyright
23: * notice, this list of conditions and the following disclaimer.
24: * 2. Redistributions in binary form must reproduce the above copyright
25: * notice, this list of conditions and the following disclaimer in the
26: * documentation and/or other materials provided with the distribution.
27: *
28: * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
29: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31: * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
32: * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38: * SUCH DAMAGE.
39: *
40: * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
41: * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.86 2001/02/04 16:08:18 phk Exp $
42: */
43:
44: #include <sys/param.h>
45: #include <sys/buf.h>
46: #include <sys/kernel.h>
47: #include <sys/malloc.h>
48: #include <sys/mount.h>
49: #include <sys/proc.h>
50: #include <sys/pool.h>
51: #include <sys/syslog.h>
52: #include <sys/systm.h>
53: #include <sys/vnode.h>
54: #include <miscfs/specfs/specdev.h>
55: #include <ufs/ufs/dir.h>
56: #include <ufs/ufs/quota.h>
57: #include <ufs/ufs/inode.h>
58: #include <ufs/ufs/ufsmount.h>
59: #include <ufs/ffs/fs.h>
60: #include <ufs/ffs/softdep.h>
61: #include <ufs/ffs/ffs_extern.h>
62: #include <ufs/ufs/ufs_extern.h>
63:
64: #define STATIC
65:
66: /*
67: * Mapping of dependency structure types to malloc types.
68: */
69: #define D_PAGEDEP 0
70: #define D_INODEDEP 1
71: #define D_NEWBLK 2
72: #define D_BMSAFEMAP 3
73: #define D_ALLOCDIRECT 4
74: #define D_INDIRDEP 5
75: #define D_ALLOCINDIR 6
76: #define D_FREEFRAG 7
77: #define D_FREEBLKS 8
78: #define D_FREEFILE 9
79: #define D_DIRADD 10
80: #define D_MKDIR 11
81: #define D_DIRREM 12
82: #define D_NEWDIRBLK 13
83: #define D_LAST 13
84: /*
85: * Names of softdep types.
86: */
87: const char *softdep_typenames[] = {
88: "pagedep",
89: "inodedep",
90: "newblk",
91: "bmsafemap",
92: "allocdirect",
93: "indirdep",
94: "allocindir",
95: "freefrag",
96: "freeblks",
97: "freefile",
98: "diradd",
99: "mkdir",
100: "dirrem",
101: "newdirblk",
102: };
103: #define TYPENAME(type) \
104: ((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???")
105: /*
106: * Finding the current process.
107: */
108: #define CURPROC curproc
109: /*
110: * End system adaptation definitions.
111: */
112:
113: /*
114: * Internal function prototypes.
115: */
116: STATIC void softdep_error(char *, int);
117: STATIC void drain_output(struct vnode *, int);
118: STATIC int getdirtybuf(struct buf *, int);
119: STATIC void clear_remove(struct proc *);
120: STATIC void clear_inodedeps(struct proc *);
121: STATIC int flush_pagedep_deps(struct vnode *, struct mount *,
122: struct diraddhd *);
123: STATIC int flush_inodedep_deps(struct fs *, ino_t);
124: STATIC int handle_written_filepage(struct pagedep *, struct buf *);
125: STATIC void diradd_inode_written(struct diradd *, struct inodedep *);
126: STATIC int handle_written_inodeblock(struct inodedep *, struct buf *);
127: STATIC void handle_allocdirect_partdone(struct allocdirect *);
128: STATIC void handle_allocindir_partdone(struct allocindir *);
129: STATIC void initiate_write_filepage(struct pagedep *, struct buf *);
130: STATIC void handle_written_mkdir(struct mkdir *, int);
131: STATIC void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
132: #ifdef FFS2
133: STATIC void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
134: #endif
135: STATIC void handle_workitem_freefile(struct freefile *);
136: STATIC void handle_workitem_remove(struct dirrem *);
137: STATIC struct dirrem *newdirrem(struct buf *, struct inode *,
138: struct inode *, int, struct dirrem **);
139: STATIC void free_diradd(struct diradd *);
140: STATIC void free_allocindir(struct allocindir *, struct inodedep *);
141: STATIC void free_newdirblk(struct newdirblk *);
142: STATIC int indir_trunc(struct inode *, daddr_t, int, daddr64_t, long *);
143: STATIC void deallocate_dependencies(struct buf *, struct inodedep *);
144: STATIC void free_allocdirect(struct allocdirectlst *,
145: struct allocdirect *, int);
146: STATIC int check_inode_unwritten(struct inodedep *);
147: STATIC int free_inodedep(struct inodedep *);
148: STATIC void handle_workitem_freeblocks(struct freeblks *);
149: STATIC void merge_inode_lists(struct inodedep *);
150: STATIC void setup_allocindir_phase2(struct buf *, struct inode *,
151: struct allocindir *);
152: STATIC struct allocindir *newallocindir(struct inode *, int, daddr_t,
153: daddr_t);
154: STATIC void handle_workitem_freefrag(struct freefrag *);
155: STATIC struct freefrag *newfreefrag(struct inode *, daddr_t, long);
156: STATIC void allocdirect_merge(struct allocdirectlst *,
157: struct allocdirect *, struct allocdirect *);
158: STATIC struct bmsafemap *bmsafemap_lookup(struct buf *);
159: STATIC int newblk_lookup(struct fs *, daddr_t, int,
160: struct newblk **);
161: STATIC int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
162: STATIC int pagedep_lookup(struct inode *, daddr64_t, int, struct pagedep **);
163: STATIC void pause_timer(void *);
164: STATIC int request_cleanup(int, int);
165: STATIC int process_worklist_item(struct mount *, int);
166: STATIC void add_to_worklist(struct worklist *);
167:
168: /*
169: * Exported softdep operations.
170: */
171: void softdep_disk_io_initiation(struct buf *);
172: void softdep_disk_write_complete(struct buf *);
173: void softdep_deallocate_dependencies(struct buf *);
174: void softdep_move_dependencies(struct buf *, struct buf *);
175: int softdep_count_dependencies(struct buf *bp, int, int);
176:
177: /*
178: * Locking primitives.
179: *
180: * For a uniprocessor, all we need to do is protect against disk
181: * interrupts. For a multiprocessor, this lock would have to be
182: * a mutex. A single mutex is used throughout this file, though
183: * finer grain locking could be used if contention warranted it.
184: *
185: * For a multiprocessor, the sleep call would accept a lock and
186: * release it after the sleep processing was complete. In a uniprocessor
187: * implementation there is no such interlock, so we simple mark
188: * the places where it needs to be done with the `interlocked' form
189: * of the lock calls. Since the uniprocessor sleep already interlocks
190: * the spl, there is nothing that really needs to be done.
191: */
192: #ifndef /* NOT */ DEBUG
193: STATIC struct lockit {
194: int lkt_spl;
195: } lk = { 0 };
196: #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
197: #define FREE_LOCK(lk) splx((lk)->lkt_spl)
198: #define ACQUIRE_LOCK_INTERLOCKED(lk,s) (lk)->lkt_spl = (s)
199: #define FREE_LOCK_INTERLOCKED(lk) ((lk)->lkt_spl)
200:
201: #else /* DEBUG */
202: STATIC struct lockit {
203: int lkt_spl;
204: pid_t lkt_held;
205: int lkt_line;
206: } lk = { 0, -1 };
207: STATIC int lockcnt;
208:
209: STATIC void acquire_lock(struct lockit *, int);
210: STATIC void free_lock(struct lockit *, int);
211: STATIC void acquire_lock_interlocked(struct lockit *, int, int);
212: STATIC int free_lock_interlocked(struct lockit *, int);
213:
214: #define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__)
215: #define FREE_LOCK(lk) free_lock(lk, __LINE__)
216: #define ACQUIRE_LOCK_INTERLOCKED(lk,s) acquire_lock_interlocked(lk, (s), __LINE__)
217: #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__)
218:
219: STATIC void
220: acquire_lock(lk, line)
221: struct lockit *lk;
222: int line;
223: {
224: pid_t holder;
225: int original_line;
226:
227: if (lk->lkt_held != -1) {
228: holder = lk->lkt_held;
229: original_line = lk->lkt_line;
230: FREE_LOCK(lk);
231: if (holder == CURPROC->p_pid)
232: panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
233: else
234: panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
235: }
236: lk->lkt_spl = splbio();
237: lk->lkt_held = CURPROC->p_pid;
238: lk->lkt_line = line;
239: lockcnt++;
240: }
241:
242: STATIC void
243: free_lock(lk, line)
244: struct lockit *lk;
245: int line;
246: {
247:
248: if (lk->lkt_held == -1)
249: panic("softdep_unlock: lock not held at line %d", line);
250: lk->lkt_held = -1;
251: splx(lk->lkt_spl);
252: }
253:
254: STATIC void
255: acquire_lock_interlocked(lk, s, line)
256: struct lockit *lk;
257: int s;
258: int line;
259: {
260: pid_t holder;
261: int original_line;
262:
263: if (lk->lkt_held != -1) {
264: holder = lk->lkt_held;
265: original_line = lk->lkt_line;
266: FREE_LOCK_INTERLOCKED(lk);
267: if (holder == CURPROC->p_pid)
268: panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
269: else
270: panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
271: }
272: lk->lkt_held = CURPROC->p_pid;
273: lk->lkt_line = line;
274: lk->lkt_spl = s;
275: lockcnt++;
276: }
277:
278: STATIC int
279: free_lock_interlocked(lk, line)
280: struct lockit *lk;
281: int line;
282: {
283:
284: if (lk->lkt_held == -1)
285: panic("softdep_unlock_interlocked: lock not held at line %d", line);
286: lk->lkt_held = -1;
287:
288: return (lk->lkt_spl);
289: }
290: #endif /* DEBUG */
291:
292: /*
293: * Place holder for real semaphores.
294: */
295: struct sema {
296: int value;
297: pid_t holder;
298: char *name;
299: int prio;
300: int timo;
301: };
302: STATIC void sema_init(struct sema *, char *, int, int);
303: STATIC int sema_get(struct sema *, struct lockit *);
304: STATIC void sema_release(struct sema *);
305:
306: STATIC void
307: sema_init(semap, name, prio, timo)
308: struct sema *semap;
309: char *name;
310: int prio, timo;
311: {
312:
313: semap->holder = -1;
314: semap->value = 0;
315: semap->name = name;
316: semap->prio = prio;
317: semap->timo = timo;
318: }
319:
320: STATIC int
321: sema_get(semap, interlock)
322: struct sema *semap;
323: struct lockit *interlock;
324: {
325: int s;
326:
327: if (semap->value++ > 0) {
328: if (interlock != NULL)
329: s = FREE_LOCK_INTERLOCKED(interlock);
330: tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
331: if (interlock != NULL) {
332: ACQUIRE_LOCK_INTERLOCKED(interlock, s);
333: FREE_LOCK(interlock);
334: }
335: return (0);
336: }
337: semap->holder = CURPROC->p_pid;
338: if (interlock != NULL)
339: FREE_LOCK(interlock);
340: return (1);
341: }
342:
343: STATIC void
344: sema_release(semap)
345: struct sema *semap;
346: {
347:
348: if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
349: #ifdef DEBUG
350: if (lk.lkt_held != -1)
351: FREE_LOCK(&lk);
352: #endif
353: panic("sema_release: not held");
354: }
355: if (--semap->value > 0) {
356: semap->value = 0;
357: wakeup(semap);
358: }
359: semap->holder = -1;
360: }
361:
362: /*
363: * Memory management.
364: */
365: STATIC struct pool pagedep_pool;
366: STATIC struct pool inodedep_pool;
367: STATIC struct pool newblk_pool;
368: STATIC struct pool bmsafemap_pool;
369: STATIC struct pool allocdirect_pool;
370: STATIC struct pool indirdep_pool;
371: STATIC struct pool allocindir_pool;
372: STATIC struct pool freefrag_pool;
373: STATIC struct pool freeblks_pool;
374: STATIC struct pool freefile_pool;
375: STATIC struct pool diradd_pool;
376: STATIC struct pool mkdir_pool;
377: STATIC struct pool dirrem_pool;
378: STATIC struct pool newdirblk_pool;
379:
380: static __inline void
381: softdep_free(struct worklist *item, int type)
382: {
383:
384: switch (type) {
385: case D_PAGEDEP:
386: pool_put(&pagedep_pool, item);
387: break;
388:
389: case D_INODEDEP:
390: pool_put(&inodedep_pool, item);
391: break;
392:
393: case D_BMSAFEMAP:
394: pool_put(&bmsafemap_pool, item);
395: break;
396:
397: case D_ALLOCDIRECT:
398: pool_put(&allocdirect_pool, item);
399: break;
400:
401: case D_INDIRDEP:
402: pool_put(&indirdep_pool, item);
403: break;
404:
405: case D_ALLOCINDIR:
406: pool_put(&allocindir_pool, item);
407: break;
408:
409: case D_FREEFRAG:
410: pool_put(&freefrag_pool, item);
411: break;
412:
413: case D_FREEBLKS:
414: pool_put(&freeblks_pool, item);
415: break;
416:
417: case D_FREEFILE:
418: pool_put(&freefile_pool, item);
419: break;
420:
421: case D_DIRADD:
422: pool_put(&diradd_pool, item);
423: break;
424:
425: case D_MKDIR:
426: pool_put(&mkdir_pool, item);
427: break;
428:
429: case D_DIRREM:
430: pool_put(&dirrem_pool, item);
431: break;
432:
433: case D_NEWDIRBLK:
434: pool_put(&newdirblk_pool, item);
435: break;
436:
437: default:
438: #ifdef DEBUG
439: if (lk.lkt_held != -1)
440: FREE_LOCK(&lk);
441: #endif
442: panic("softdep_free: unknown type %d", type);
443: }
444: }
445:
446: struct workhead softdep_freequeue;
447:
448: static __inline void
449: softdep_freequeue_add(struct worklist *item)
450: {
451: int s;
452:
453: s = splbio();
454: LIST_INSERT_HEAD(&softdep_freequeue, item, wk_list);
455: splx(s);
456: }
457:
458: static __inline void
459: softdep_freequeue_process(void)
460: {
461: struct worklist *wk;
462:
463: splassert(IPL_BIO);
464:
465: while ((wk = LIST_FIRST(&softdep_freequeue)) != NULL) {
466: LIST_REMOVE(wk, wk_list);
467: FREE_LOCK(&lk);
468: softdep_free(wk, wk->wk_type);
469: ACQUIRE_LOCK(&lk);
470: }
471: }
472:
473: /*
474: * Worklist queue management.
475: * These routines require that the lock be held.
476: */
477: #ifndef /* NOT */ DEBUG
478: #define WORKLIST_INSERT(head, item) do { \
479: (item)->wk_state |= ONWORKLIST; \
480: LIST_INSERT_HEAD(head, item, wk_list); \
481: } while (0)
482: #define WORKLIST_REMOVE(item) do { \
483: (item)->wk_state &= ~ONWORKLIST; \
484: LIST_REMOVE(item, wk_list); \
485: } while (0)
486: #define WORKITEM_FREE(item, type) softdep_freequeue_add((struct worklist *)item)
487:
488: #else /* DEBUG */
489: STATIC void worklist_insert(struct workhead *, struct worklist *);
490: STATIC void worklist_remove(struct worklist *);
491: STATIC void workitem_free(struct worklist *);
492:
493: #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
494: #define WORKLIST_REMOVE(item) worklist_remove(item)
495: #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item)
496:
497: STATIC void
498: worklist_insert(head, item)
499: struct workhead *head;
500: struct worklist *item;
501: {
502:
503: if (lk.lkt_held == -1)
504: panic("worklist_insert: lock not held");
505: if (item->wk_state & ONWORKLIST) {
506: FREE_LOCK(&lk);
507: panic("worklist_insert: already on list");
508: }
509: item->wk_state |= ONWORKLIST;
510: LIST_INSERT_HEAD(head, item, wk_list);
511: }
512:
513: STATIC void
514: worklist_remove(item)
515: struct worklist *item;
516: {
517:
518: if (lk.lkt_held == -1)
519: panic("worklist_remove: lock not held");
520: if ((item->wk_state & ONWORKLIST) == 0) {
521: FREE_LOCK(&lk);
522: panic("worklist_remove: not on list");
523: }
524: item->wk_state &= ~ONWORKLIST;
525: LIST_REMOVE(item, wk_list);
526: }
527:
528: STATIC void
529: workitem_free(item)
530: struct worklist *item;
531: {
532:
533: if (item->wk_state & ONWORKLIST) {
534: if (lk.lkt_held != -1)
535: FREE_LOCK(&lk);
536: panic("workitem_free: still on list");
537: }
538: softdep_freequeue_add(item);
539: }
540: #endif /* DEBUG */
541:
542: /*
543: * Workitem queue management
544: */
545: STATIC struct workhead softdep_workitem_pending;
546: STATIC struct worklist *worklist_tail;
547: STATIC int num_on_worklist; /* number of worklist items to be processed */
548: STATIC int softdep_worklist_busy; /* 1 => trying to do unmount */
549: STATIC int softdep_worklist_req; /* serialized waiters */
550: STATIC int max_softdeps; /* maximum number of structs before slowdown */
551: STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */
552: STATIC int proc_waiting; /* tracks whether we have a timeout posted */
553: STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */
554: STATIC struct timeout proc_waiting_timeout;
555: STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */
556: STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */
557: #define FLUSH_INODES 1
558: STATIC int req_clear_remove; /* syncer process flush some freeblks */
559: #define FLUSH_REMOVE 2
560: /*
561: * runtime statistics
562: */
563: STATIC int stat_worklist_push; /* number of worklist cleanups */
564: STATIC int stat_blk_limit_push; /* number of times block limit neared */
565: STATIC int stat_ino_limit_push; /* number of times inode limit neared */
566: STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */
567: STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */
568: STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
569: STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
570: STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
571: STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
572: STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
573:
574: /*
575: * Add an item to the end of the work queue.
576: * This routine requires that the lock be held.
577: * This is the only routine that adds items to the list.
578: * The following routine is the only one that removes items
579: * and does so in order from first to last.
580: */
581: STATIC void
582: add_to_worklist(wk)
583: struct worklist *wk;
584: {
585:
586: if (wk->wk_state & ONWORKLIST) {
587: #ifdef DEBUG
588: if (lk.lkt_held != -1)
589: FREE_LOCK(&lk);
590: #endif
591: panic("add_to_worklist: already on list");
592: }
593: wk->wk_state |= ONWORKLIST;
594: if (LIST_FIRST(&softdep_workitem_pending) == NULL)
595: LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
596: else
597: LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
598: worklist_tail = wk;
599: num_on_worklist += 1;
600: }
601:
602: /*
603: * Process that runs once per second to handle items in the background queue.
604: *
605: * Note that we ensure that everything is done in the order in which they
606: * appear in the queue. The code below depends on this property to ensure
607: * that blocks of a file are freed before the inode itself is freed. This
608: * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
609: * until all the old ones have been purged from the dependency lists.
610: */
611: int
612: softdep_process_worklist(matchmnt)
613: struct mount *matchmnt;
614: {
615: struct proc *p = CURPROC;
616: int matchcnt, loopcount;
617: struct timeval starttime;
618:
619: /*
620: * First process any items on the delayed-free queue.
621: */
622: ACQUIRE_LOCK(&lk);
623: softdep_freequeue_process();
624: FREE_LOCK(&lk);
625:
626: /*
627: * Record the process identifier of our caller so that we can give
628: * this process preferential treatment in request_cleanup below.
629: * We can't do this in softdep_initialize, because the syncer doesn't
630: * have to run then.
631: * NOTE! This function _could_ be called with a curproc != syncerproc.
632: */
633: filesys_syncer = syncerproc;
634: matchcnt = 0;
635:
636: /*
637: * There is no danger of having multiple processes run this
638: * code, but we have to single-thread it when softdep_flushfiles()
639: * is in operation to get an accurate count of the number of items
640: * related to its mount point that are in the list.
641: */
642: if (matchmnt == NULL) {
643: if (softdep_worklist_busy < 0)
644: return(-1);
645: softdep_worklist_busy += 1;
646: }
647:
648: /*
649: * If requested, try removing inode or removal dependencies.
650: */
651: if (req_clear_inodedeps) {
652: clear_inodedeps(p);
653: req_clear_inodedeps -= 1;
654: wakeup_one(&proc_waiting);
655: }
656: if (req_clear_remove) {
657: clear_remove(p);
658: req_clear_remove -= 1;
659: wakeup_one(&proc_waiting);
660: }
661: loopcount = 1;
662: getmicrouptime(&starttime);
663: while (num_on_worklist > 0) {
664: matchcnt += process_worklist_item(matchmnt, 0);
665:
666: /*
667: * If a umount operation wants to run the worklist
668: * accurately, abort.
669: */
670: if (softdep_worklist_req && matchmnt == NULL) {
671: matchcnt = -1;
672: break;
673: }
674:
675: /*
676: * If requested, try removing inode or removal dependencies.
677: */
678: if (req_clear_inodedeps) {
679: clear_inodedeps(p);
680: req_clear_inodedeps -= 1;
681: wakeup_one(&proc_waiting);
682: }
683: if (req_clear_remove) {
684: clear_remove(p);
685: req_clear_remove -= 1;
686: wakeup_one(&proc_waiting);
687: }
688: /*
689: * We do not generally want to stop for buffer space, but if
690: * we are really being a buffer hog, we will stop and wait.
691: */
692: #if 0
693: if (loopcount++ % 128 == 0)
694: bwillwrite();
695: #endif
696: /*
697: * Never allow processing to run for more than one
698: * second. Otherwise the other syncer tasks may get
699: * excessively backlogged.
700: */
701: {
702: struct timeval diff;
703: struct timeval tv;
704:
705: getmicrouptime(&tv);
706: timersub(&tv, &starttime, &diff);
707: if (diff.tv_sec != 0 && matchmnt == NULL) {
708: matchcnt = -1;
709: break;
710: }
711: }
712:
713: /*
714: * Process any new items on the delayed-free queue.
715: */
716: ACQUIRE_LOCK(&lk);
717: softdep_freequeue_process();
718: FREE_LOCK(&lk);
719: }
720: if (matchmnt == NULL) {
721: softdep_worklist_busy -= 1;
722: if (softdep_worklist_req && softdep_worklist_busy == 0)
723: wakeup(&softdep_worklist_req);
724: }
725: return (matchcnt);
726: }
727:
728: /*
729: * Process one item on the worklist.
730: */
731: STATIC int
732: process_worklist_item(matchmnt, flags)
733: struct mount *matchmnt;
734: int flags;
735: {
736: struct worklist *wk, *wkend;
737: struct dirrem *dirrem;
738: struct mount *mp;
739: struct vnode *vp;
740: int matchcnt = 0;
741:
742: ACQUIRE_LOCK(&lk);
743: /*
744: * Normally we just process each item on the worklist in order.
745: * However, if we are in a situation where we cannot lock any
746: * inodes, we have to skip over any dirrem requests whose
747: * vnodes are resident and locked.
748: */
749: LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
750: if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
751: break;
752: dirrem = WK_DIRREM(wk);
753: vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
754: dirrem->dm_oldinum);
755: if (vp == NULL || !VOP_ISLOCKED(vp))
756: break;
757: }
758: if (wk == 0) {
759: FREE_LOCK(&lk);
760: return (0);
761: }
762: /*
763: * Remove the item to be processed. If we are removing the last
764: * item on the list, we need to recalculate the tail pointer.
765: * As this happens rarely and usually when the list is short,
766: * we just run down the list to find it rather than tracking it
767: * in the above loop.
768: */
769: WORKLIST_REMOVE(wk);
770: if (wk == worklist_tail) {
771: LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
772: if (LIST_NEXT(wkend, wk_list) == NULL)
773: break;
774: worklist_tail = wkend;
775: }
776: num_on_worklist -= 1;
777: FREE_LOCK(&lk);
778: switch (wk->wk_type) {
779:
780: case D_DIRREM:
781: /* removal of a directory entry */
782: mp = WK_DIRREM(wk)->dm_mnt;
783: #if 0
784: if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
785: panic("%s: dirrem on suspended filesystem",
786: "process_worklist_item");
787: #endif
788: if (mp == matchmnt)
789: matchcnt += 1;
790: handle_workitem_remove(WK_DIRREM(wk));
791: break;
792:
793: case D_FREEBLKS:
794: /* releasing blocks and/or fragments from a file */
795: mp = WK_FREEBLKS(wk)->fb_mnt;
796: #if 0
797: if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
798: panic("%s: freeblks on suspended filesystem",
799: "process_worklist_item");
800: #endif
801: if (mp == matchmnt)
802: matchcnt += 1;
803: handle_workitem_freeblocks(WK_FREEBLKS(wk));
804: break;
805:
806: case D_FREEFRAG:
807: /* releasing a fragment when replaced as a file grows */
808: mp = WK_FREEFRAG(wk)->ff_mnt;
809: #if 0
810: if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
811: panic("%s: freefrag on suspended filesystem",
812: "process_worklist_item");
813: #endif
814: if (mp == matchmnt)
815: matchcnt += 1;
816: handle_workitem_freefrag(WK_FREEFRAG(wk));
817: break;
818:
819: case D_FREEFILE:
820: /* releasing an inode when its link count drops to 0 */
821: mp = WK_FREEFILE(wk)->fx_mnt;
822: #if 0
823: if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
824: panic("%s: freefile on suspended filesystem",
825: "process_worklist_item");
826: #endif
827: if (mp == matchmnt)
828: matchcnt += 1;
829: handle_workitem_freefile(WK_FREEFILE(wk));
830: break;
831:
832: default:
833: panic("%s_process_worklist: Unknown type %s",
834: "softdep", TYPENAME(wk->wk_type));
835: /* NOTREACHED */
836: }
837: return (matchcnt);
838: }
839:
840: /*
841: * Move dependencies from one buffer to another.
842: */
843: void
844: softdep_move_dependencies(oldbp, newbp)
845: struct buf *oldbp;
846: struct buf *newbp;
847: {
848: struct worklist *wk, *wktail;
849:
850: if (LIST_FIRST(&newbp->b_dep) != NULL)
851: panic("softdep_move_dependencies: need merge code");
852: wktail = 0;
853: ACQUIRE_LOCK(&lk);
854: while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
855: LIST_REMOVE(wk, wk_list);
856: if (wktail == 0)
857: LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
858: else
859: LIST_INSERT_AFTER(wktail, wk, wk_list);
860: wktail = wk;
861: }
862: FREE_LOCK(&lk);
863: }
864:
865: /*
866: * Purge the work list of all items associated with a particular mount point.
867: */
868: int
869: softdep_flushworklist(oldmnt, countp, p)
870: struct mount *oldmnt;
871: int *countp;
872: struct proc *p;
873: {
874: struct vnode *devvp;
875: int count, error = 0;
876:
877: /*
878: * Await our turn to clear out the queue, then serialize access.
879: */
880: while (softdep_worklist_busy) {
881: softdep_worklist_req += 1;
882: tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
883: softdep_worklist_req -= 1;
884: }
885: softdep_worklist_busy = -1;
886: /*
887: * Alternately flush the block device associated with the mount
888: * point and process any dependencies that the flushing
889: * creates. We continue until no more worklist dependencies
890: * are found.
891: */
892: *countp = 0;
893: devvp = VFSTOUFS(oldmnt)->um_devvp;
894: while ((count = softdep_process_worklist(oldmnt)) > 0) {
895: *countp += count;
896: vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
897: error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
898: VOP_UNLOCK(devvp, 0, p);
899: if (error)
900: break;
901: }
902: softdep_worklist_busy = 0;
903: if (softdep_worklist_req)
904: wakeup(&softdep_worklist_req);
905: return (error);
906: }
907:
908: /*
909: * Flush all vnodes and worklist items associated with a specified mount point.
910: */
911: int
912: softdep_flushfiles(oldmnt, flags, p)
913: struct mount *oldmnt;
914: int flags;
915: struct proc *p;
916: {
917: int error, count, loopcnt;
918:
919: /*
920: * Alternately flush the vnodes associated with the mount
921: * point and process any dependencies that the flushing
922: * creates. In theory, this loop can happen at most twice,
923: * but we give it a few extra just to be sure.
924: */
925: for (loopcnt = 10; loopcnt > 0; loopcnt--) {
926: /*
927: * Do another flush in case any vnodes were brought in
928: * as part of the cleanup operations.
929: */
930: if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
931: break;
932: if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
933: count == 0)
934: break;
935: }
936: /*
937: * If we are unmounting then it is an error to fail. If we
938: * are simply trying to downgrade to read-only, then filesystem
939: * activity can keep us busy forever, so we just fail with EBUSY.
940: */
941: if (loopcnt == 0) {
942: error = EBUSY;
943: }
944: return (error);
945: }
946:
947: /*
948: * Structure hashing.
949: *
950: * There are three types of structures that can be looked up:
951: * 1) pagedep structures identified by mount point, inode number,
952: * and logical block.
953: * 2) inodedep structures identified by mount point and inode number.
954: * 3) newblk structures identified by mount point and
955: * physical block number.
956: *
957: * The "pagedep" and "inodedep" dependency structures are hashed
958: * separately from the file blocks and inodes to which they correspond.
959: * This separation helps when the in-memory copy of an inode or
960: * file block must be replaced. It also obviates the need to access
961: * an inode or file page when simply updating (or de-allocating)
962: * dependency structures. Lookup of newblk structures is needed to
963: * find newly allocated blocks when trying to associate them with
964: * their allocdirect or allocindir structure.
965: *
966: * The lookup routines optionally create and hash a new instance when
967: * an existing entry is not found.
968: */
969: #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
970: #define NODELAY 0x0002 /* cannot do background work */
971:
972: /*
973: * Structures and routines associated with pagedep caching.
974: */
975: LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
976: u_long pagedep_hash; /* size of hash table - 1 */
977: #define PAGEDEP_HASH(mp, inum, lbn) \
978: (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
979: pagedep_hash])
980: STATIC struct sema pagedep_in_progress;
981:
982: /*
983: * Look up a pagedep. Return 1 if found, 0 if not found or found
984: * when asked to allocate but not associated with any buffer.
985: * If not found, allocate if DEPALLOC flag is passed.
986: * Found or allocated entry is returned in pagedeppp.
987: * This routine must be called with splbio interrupts blocked.
988: */
989: STATIC int
990: pagedep_lookup(ip, lbn, flags, pagedeppp)
991: struct inode *ip;
992: daddr64_t lbn;
993: int flags;
994: struct pagedep **pagedeppp;
995: {
996: struct pagedep *pagedep;
997: struct pagedep_hashhead *pagedephd;
998: struct mount *mp;
999: int i;
1000:
1001: splassert(IPL_BIO);
1002:
1003: #ifdef DEBUG
1004: if (lk.lkt_held == -1)
1005: panic("pagedep_lookup: lock not held");
1006: #endif
1007: mp = ITOV(ip)->v_mount;
1008: pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1009: top:
1010: LIST_FOREACH(pagedep, pagedephd, pd_hash)
1011: if (ip->i_number == pagedep->pd_ino &&
1012: lbn == pagedep->pd_lbn &&
1013: mp == pagedep->pd_mnt)
1014: break;
1015: if (pagedep) {
1016: *pagedeppp = pagedep;
1017: if ((flags & DEPALLOC) != 0 &&
1018: (pagedep->pd_state & ONWORKLIST) == 0)
1019: return (0);
1020: return (1);
1021: }
1022: if ((flags & DEPALLOC) == 0) {
1023: *pagedeppp = NULL;
1024: return (0);
1025: }
1026: if (sema_get(&pagedep_in_progress, &lk) == 0) {
1027: ACQUIRE_LOCK(&lk);
1028: goto top;
1029: }
1030: pagedep = pool_get(&pagedep_pool, PR_WAITOK);
1031: bzero(pagedep, sizeof(struct pagedep));
1032: pagedep->pd_list.wk_type = D_PAGEDEP;
1033: pagedep->pd_mnt = mp;
1034: pagedep->pd_ino = ip->i_number;
1035: pagedep->pd_lbn = lbn;
1036: LIST_INIT(&pagedep->pd_dirremhd);
1037: LIST_INIT(&pagedep->pd_pendinghd);
1038: for (i = 0; i < DAHASHSZ; i++)
1039: LIST_INIT(&pagedep->pd_diraddhd[i]);
1040: ACQUIRE_LOCK(&lk);
1041: LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1042: sema_release(&pagedep_in_progress);
1043: *pagedeppp = pagedep;
1044: return (0);
1045: }
1046:
1047: /*
1048: * Structures and routines associated with inodedep caching.
1049: */
1050: LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1051: STATIC u_long inodedep_hash; /* size of hash table - 1 */
1052: STATIC long num_inodedep; /* number of inodedep allocated */
1053: #define INODEDEP_HASH(fs, inum) \
1054: (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1055: STATIC struct sema inodedep_in_progress;
1056:
1057: /*
1058: * Look up a inodedep. Return 1 if found, 0 if not found.
1059: * If not found, allocate if DEPALLOC flag is passed.
1060: * Found or allocated entry is returned in inodedeppp.
1061: * This routine must be called with splbio interrupts blocked.
1062: */
1063: STATIC int
1064: inodedep_lookup(fs, inum, flags, inodedeppp)
1065: struct fs *fs;
1066: ino_t inum;
1067: int flags;
1068: struct inodedep **inodedeppp;
1069: {
1070: struct inodedep *inodedep;
1071: struct inodedep_hashhead *inodedephd;
1072: int firsttry;
1073:
1074: splassert(IPL_BIO);
1075:
1076: #ifdef DEBUG
1077: if (lk.lkt_held == -1)
1078: panic("inodedep_lookup: lock not held");
1079: #endif
1080: firsttry = 1;
1081: inodedephd = INODEDEP_HASH(fs, inum);
1082: top:
1083: LIST_FOREACH(inodedep, inodedephd, id_hash)
1084: if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1085: break;
1086: if (inodedep) {
1087: *inodedeppp = inodedep;
1088: return (1);
1089: }
1090: if ((flags & DEPALLOC) == 0) {
1091: *inodedeppp = NULL;
1092: return (0);
1093: }
1094: /*
1095: * If we are over our limit, try to improve the situation.
1096: */
1097: if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1098: request_cleanup(FLUSH_INODES, 1)) {
1099: firsttry = 0;
1100: goto top;
1101: }
1102: if (sema_get(&inodedep_in_progress, &lk) == 0) {
1103: ACQUIRE_LOCK(&lk);
1104: goto top;
1105: }
1106: num_inodedep += 1;
1107: inodedep = pool_get(&inodedep_pool, PR_WAITOK);
1108: inodedep->id_list.wk_type = D_INODEDEP;
1109: inodedep->id_fs = fs;
1110: inodedep->id_ino = inum;
1111: inodedep->id_state = ALLCOMPLETE;
1112: inodedep->id_nlinkdelta = 0;
1113: inodedep->id_savedino1 = NULL;
1114: inodedep->id_savedsize = -1;
1115: inodedep->id_buf = NULL;
1116: LIST_INIT(&inodedep->id_pendinghd);
1117: LIST_INIT(&inodedep->id_inowait);
1118: LIST_INIT(&inodedep->id_bufwait);
1119: TAILQ_INIT(&inodedep->id_inoupdt);
1120: TAILQ_INIT(&inodedep->id_newinoupdt);
1121: ACQUIRE_LOCK(&lk);
1122: LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1123: sema_release(&inodedep_in_progress);
1124: *inodedeppp = inodedep;
1125: return (0);
1126: }
1127:
1128: /*
1129: * Structures and routines associated with newblk caching.
1130: */
1131: LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1132: u_long newblk_hash; /* size of hash table - 1 */
1133: #define NEWBLK_HASH(fs, inum) \
1134: (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1135: STATIC struct sema newblk_in_progress;
1136:
1137: /*
1138: * Look up a newblk. Return 1 if found, 0 if not found.
1139: * If not found, allocate if DEPALLOC flag is passed.
1140: * Found or allocated entry is returned in newblkpp.
1141: */
1142: STATIC int
1143: newblk_lookup(fs, newblkno, flags, newblkpp)
1144: struct fs *fs;
1145: daddr_t newblkno;
1146: int flags;
1147: struct newblk **newblkpp;
1148: {
1149: struct newblk *newblk;
1150: struct newblk_hashhead *newblkhd;
1151:
1152: newblkhd = NEWBLK_HASH(fs, newblkno);
1153: top:
1154: LIST_FOREACH(newblk, newblkhd, nb_hash)
1155: if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1156: break;
1157: if (newblk) {
1158: *newblkpp = newblk;
1159: return (1);
1160: }
1161: if ((flags & DEPALLOC) == 0) {
1162: *newblkpp = NULL;
1163: return (0);
1164: }
1165: if (sema_get(&newblk_in_progress, 0) == 0)
1166: goto top;
1167: newblk = pool_get(&newblk_pool, PR_WAITOK);
1168: newblk->nb_state = 0;
1169: newblk->nb_fs = fs;
1170: newblk->nb_newblkno = newblkno;
1171: LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1172: sema_release(&newblk_in_progress);
1173: *newblkpp = newblk;
1174: return (0);
1175: }
1176:
1177: /*
1178: * Executed during filesystem system initialization before
1179: * mounting any file systems.
1180: */
1181: void
1182: softdep_initialize()
1183: {
1184:
1185: bioops.io_start = softdep_disk_io_initiation;
1186: bioops.io_complete = softdep_disk_write_complete;
1187: bioops.io_deallocate = softdep_deallocate_dependencies;
1188: bioops.io_movedeps = softdep_move_dependencies;
1189: bioops.io_countdeps = softdep_count_dependencies;
1190:
1191: LIST_INIT(&mkdirlisthd);
1192: LIST_INIT(&softdep_workitem_pending);
1193: #ifdef KMEMSTATS
1194: max_softdeps = min (desiredvnodes * 8,
1195: kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep)));
1196: #else
1197: max_softdeps = desiredvnodes * 4;
1198: #endif
1199: pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, M_WAITOK,
1200: &pagedep_hash);
1201: sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1202: inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, M_WAITOK,
1203: &inodedep_hash);
1204: sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1205: newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash);
1206: sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1207: timeout_set(&proc_waiting_timeout, pause_timer, 0);
1208: pool_init(&pagedep_pool, sizeof(struct pagedep), 0, 0, 0,
1209: "pagedeppl", &pool_allocator_nointr);
1210: pool_init(&inodedep_pool, sizeof(struct inodedep), 0, 0, 0,
1211: "inodedeppl", &pool_allocator_nointr);
1212: pool_init(&newblk_pool, sizeof(struct newblk), 0, 0, 0,
1213: "newblkpl", &pool_allocator_nointr);
1214: pool_init(&bmsafemap_pool, sizeof(struct bmsafemap), 0, 0, 0,
1215: "bmsafemappl", &pool_allocator_nointr);
1216: pool_init(&allocdirect_pool, sizeof(struct allocdirect), 0, 0, 0,
1217: "allocdirectpl", &pool_allocator_nointr);
1218: pool_init(&indirdep_pool, sizeof(struct indirdep), 0, 0, 0,
1219: "indirdeppl", &pool_allocator_nointr);
1220: pool_init(&allocindir_pool, sizeof(struct allocindir), 0, 0, 0,
1221: "allocindirpl", &pool_allocator_nointr);
1222: pool_init(&freefrag_pool, sizeof(struct freefrag), 0, 0, 0,
1223: "freefragpl", &pool_allocator_nointr);
1224: pool_init(&freeblks_pool, sizeof(struct freeblks), 0, 0, 0,
1225: "freeblkspl", &pool_allocator_nointr);
1226: pool_init(&freefile_pool, sizeof(struct freefile), 0, 0, 0,
1227: "freefilepl", &pool_allocator_nointr);
1228: pool_init(&diradd_pool, sizeof(struct diradd), 0, 0, 0,
1229: "diraddpl", &pool_allocator_nointr);
1230: pool_init(&mkdir_pool, sizeof(struct mkdir), 0, 0, 0,
1231: "mkdirpl", &pool_allocator_nointr);
1232: pool_init(&dirrem_pool, sizeof(struct dirrem), 0, 0, 0,
1233: "dirrempl", &pool_allocator_nointr);
1234: pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, 0, 0,
1235: "newdirblkpl", &pool_allocator_nointr);
1236: }
1237:
1238: /*
1239: * Called at mount time to notify the dependency code that a
1240: * filesystem wishes to use it.
1241: */
1242: int
1243: softdep_mount(devvp, mp, fs, cred)
1244: struct vnode *devvp;
1245: struct mount *mp;
1246: struct fs *fs;
1247: struct ucred *cred;
1248: {
1249: struct csum_total cstotal;
1250: struct cg *cgp;
1251: struct buf *bp;
1252: int error, cyl;
1253:
1254: /*
1255: * When doing soft updates, the counters in the
1256: * superblock may have gotten out of sync, so we have
1257: * to scan the cylinder groups and recalculate them.
1258: */
1259: if ((fs->fs_flags & FS_UNCLEAN) == 0)
1260: return (0);
1261: bzero(&cstotal, sizeof cstotal);
1262: for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1263: if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1264: fs->fs_cgsize, cred, &bp)) != 0) {
1265: brelse(bp);
1266: return (error);
1267: }
1268: cgp = (struct cg *)bp->b_data;
1269: cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1270: cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1271: cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1272: cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1273: fs->fs_cs(fs, cyl) = cgp->cg_cs;
1274: brelse(bp);
1275: }
1276: #ifdef DEBUG
1277: if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1278: printf("ffs_mountfs: superblock updated for soft updates\n");
1279: #endif
1280: bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1281: return (0);
1282: }
1283:
1284: /*
1285: * Protecting the freemaps (or bitmaps).
1286: *
1287: * To eliminate the need to execute fsck before mounting a file system
1288: * after a power failure, one must (conservatively) guarantee that the
1289: * on-disk copy of the bitmaps never indicate that a live inode or block is
1290: * free. So, when a block or inode is allocated, the bitmap should be
1291: * updated (on disk) before any new pointers. When a block or inode is
1292: * freed, the bitmap should not be updated until all pointers have been
1293: * reset. The latter dependency is handled by the delayed de-allocation
1294: * approach described below for block and inode de-allocation. The former
1295: * dependency is handled by calling the following procedure when a block or
1296: * inode is allocated. When an inode is allocated an "inodedep" is created
1297: * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1298: * Each "inodedep" is also inserted into the hash indexing structure so
1299: * that any additional link additions can be made dependent on the inode
1300: * allocation.
1301: *
1302: * The ufs file system maintains a number of free block counts (e.g., per
1303: * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1304: * in addition to the bitmaps. These counts are used to improve efficiency
1305: * during allocation and therefore must be consistent with the bitmaps.
1306: * There is no convenient way to guarantee post-crash consistency of these
1307: * counts with simple update ordering, for two main reasons: (1) The counts
1308: * and bitmaps for a single cylinder group block are not in the same disk
1309: * sector. If a disk write is interrupted (e.g., by power failure), one may
1310: * be written and the other not. (2) Some of the counts are located in the
1311: * superblock rather than the cylinder group block. So, we focus our soft
1312: * updates implementation on protecting the bitmaps. When mounting a
1313: * filesystem, we recompute the auxiliary counts from the bitmaps.
1314: */
1315:
1316: /*
1317: * Called just after updating the cylinder group block to allocate an inode.
1318: */
1319: void
1320: softdep_setup_inomapdep(bp, ip, newinum)
1321: struct buf *bp; /* buffer for cylgroup block with inode map */
1322: struct inode *ip; /* inode related to allocation */
1323: ino_t newinum; /* new inode number being allocated */
1324: {
1325: struct inodedep *inodedep;
1326: struct bmsafemap *bmsafemap;
1327:
1328: /*
1329: * Create a dependency for the newly allocated inode.
1330: * Panic if it already exists as something is seriously wrong.
1331: * Otherwise add it to the dependency list for the buffer holding
1332: * the cylinder group map from which it was allocated.
1333: */
1334: ACQUIRE_LOCK(&lk);
1335: if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep)
1336: != 0) {
1337: FREE_LOCK(&lk);
1338: panic("softdep_setup_inomapdep: found inode");
1339: }
1340: inodedep->id_buf = bp;
1341: inodedep->id_state &= ~DEPCOMPLETE;
1342: bmsafemap = bmsafemap_lookup(bp);
1343: LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1344: FREE_LOCK(&lk);
1345: }
1346:
1347: /*
1348: * Called just after updating the cylinder group block to
1349: * allocate block or fragment.
1350: */
1351: void
1352: softdep_setup_blkmapdep(bp, fs, newblkno)
1353: struct buf *bp; /* buffer for cylgroup block with block map */
1354: struct fs *fs; /* filesystem doing allocation */
1355: daddr_t newblkno; /* number of newly allocated block */
1356: {
1357: struct newblk *newblk;
1358: struct bmsafemap *bmsafemap;
1359:
1360: /*
1361: * Create a dependency for the newly allocated block.
1362: * Add it to the dependency list for the buffer holding
1363: * the cylinder group map from which it was allocated.
1364: */
1365: if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1366: panic("softdep_setup_blkmapdep: found block");
1367: ACQUIRE_LOCK(&lk);
1368: newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1369: LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1370: FREE_LOCK(&lk);
1371: }
1372:
1373: /*
1374: * Find the bmsafemap associated with a cylinder group buffer.
1375: * If none exists, create one. The buffer must be locked when
1376: * this routine is called and this routine must be called with
1377: * splbio interrupts blocked.
1378: */
1379: STATIC struct bmsafemap *
1380: bmsafemap_lookup(bp)
1381: struct buf *bp;
1382: {
1383: struct bmsafemap *bmsafemap;
1384: struct worklist *wk;
1385:
1386: splassert(IPL_BIO);
1387:
1388: #ifdef DEBUG
1389: if (lk.lkt_held == -1)
1390: panic("bmsafemap_lookup: lock not held");
1391: #endif
1392: LIST_FOREACH(wk, &bp->b_dep, wk_list)
1393: if (wk->wk_type == D_BMSAFEMAP)
1394: return (WK_BMSAFEMAP(wk));
1395: FREE_LOCK(&lk);
1396: bmsafemap = pool_get(&bmsafemap_pool, PR_WAITOK);
1397: bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1398: bmsafemap->sm_list.wk_state = 0;
1399: bmsafemap->sm_buf = bp;
1400: LIST_INIT(&bmsafemap->sm_allocdirecthd);
1401: LIST_INIT(&bmsafemap->sm_allocindirhd);
1402: LIST_INIT(&bmsafemap->sm_inodedephd);
1403: LIST_INIT(&bmsafemap->sm_newblkhd);
1404: ACQUIRE_LOCK(&lk);
1405: WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1406: return (bmsafemap);
1407: }
1408:
1409: /*
1410: * Direct block allocation dependencies.
1411: *
1412: * When a new block is allocated, the corresponding disk locations must be
1413: * initialized (with zeros or new data) before the on-disk inode points to
1414: * them. Also, the freemap from which the block was allocated must be
1415: * updated (on disk) before the inode's pointer. These two dependencies are
1416: * independent of each other and are needed for all file blocks and indirect
1417: * blocks that are pointed to directly by the inode. Just before the
1418: * "in-core" version of the inode is updated with a newly allocated block
1419: * number, a procedure (below) is called to setup allocation dependency
1420: * structures. These structures are removed when the corresponding
1421: * dependencies are satisfied or when the block allocation becomes obsolete
1422: * (i.e., the file is deleted, the block is de-allocated, or the block is a
1423: * fragment that gets upgraded). All of these cases are handled in
1424: * procedures described later.
1425: *
1426: * When a file extension causes a fragment to be upgraded, either to a larger
1427: * fragment or to a full block, the on-disk location may change (if the
1428: * previous fragment could not simply be extended). In this case, the old
1429: * fragment must be de-allocated, but not until after the inode's pointer has
1430: * been updated. In most cases, this is handled by later procedures, which
1431: * will construct a "freefrag" structure to be added to the workitem queue
1432: * when the inode update is complete (or obsolete). The main exception to
1433: * this is when an allocation occurs while a pending allocation dependency
1434: * (for the same block pointer) remains. This case is handled in the main
1435: * allocation dependency setup procedure by immediately freeing the
1436: * unreferenced fragments.
1437: */
1438: void
1439: softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1440: struct inode *ip; /* inode to which block is being added */
1441: daddr64_t lbn; /* block pointer within inode */
1442: daddr_t newblkno; /* disk block number being added */
1443: daddr_t oldblkno; /* previous block number, 0 unless frag */
1444: long newsize; /* size of new block */
1445: long oldsize; /* size of new block */
1446: struct buf *bp; /* bp for allocated block */
1447: {
1448: struct allocdirect *adp, *oldadp;
1449: struct allocdirectlst *adphead;
1450: struct bmsafemap *bmsafemap;
1451: struct inodedep *inodedep;
1452: struct pagedep *pagedep;
1453: struct newblk *newblk;
1454:
1455: adp = pool_get(&allocdirect_pool, PR_WAITOK);
1456: bzero(adp, sizeof(struct allocdirect));
1457: adp->ad_list.wk_type = D_ALLOCDIRECT;
1458: adp->ad_lbn = lbn;
1459: adp->ad_newblkno = newblkno;
1460: adp->ad_oldblkno = oldblkno;
1461: adp->ad_newsize = newsize;
1462: adp->ad_oldsize = oldsize;
1463: adp->ad_state = ATTACHED;
1464: LIST_INIT(&adp->ad_newdirblk);
1465: if (newblkno == oldblkno)
1466: adp->ad_freefrag = NULL;
1467: else
1468: adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1469:
1470: if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1471: panic("softdep_setup_allocdirect: lost block");
1472:
1473: ACQUIRE_LOCK(&lk);
1474: inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1475: adp->ad_inodedep = inodedep;
1476:
1477: if (newblk->nb_state == DEPCOMPLETE) {
1478: adp->ad_state |= DEPCOMPLETE;
1479: adp->ad_buf = NULL;
1480: } else {
1481: bmsafemap = newblk->nb_bmsafemap;
1482: adp->ad_buf = bmsafemap->sm_buf;
1483: LIST_REMOVE(newblk, nb_deps);
1484: LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1485: }
1486: LIST_REMOVE(newblk, nb_hash);
1487: pool_put(&newblk_pool, newblk);
1488:
1489: if (bp == NULL) {
1490: /*
1491: * XXXUBC - Yes, I know how to fix this, but not right now.
1492: */
1493: panic("softdep_setup_allocdirect: Bonk art in the head");
1494: }
1495: WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1496: if (lbn >= NDADDR) {
1497: /* allocating an indirect block */
1498: if (oldblkno != 0) {
1499: FREE_LOCK(&lk);
1500: panic("softdep_setup_allocdirect: non-zero indir");
1501: }
1502: } else {
1503: /*
1504: * Allocating a direct block.
1505: *
1506: * If we are allocating a directory block, then we must
1507: * allocate an associated pagedep to track additions and
1508: * deletions.
1509: */
1510: if ((DIP(ip, mode) & IFMT) == IFDIR &&
1511: pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1512: WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1513: }
1514: /*
1515: * The list of allocdirects must be kept in sorted and ascending
1516: * order so that the rollback routines can quickly determine the
1517: * first uncommitted block (the size of the file stored on disk
1518: * ends at the end of the lowest committed fragment, or if there
1519: * are no fragments, at the end of the highest committed block).
1520: * Since files generally grow, the typical case is that the new
1521: * block is to be added at the end of the list. We speed this
1522: * special case by checking against the last allocdirect in the
1523: * list before laboriously traversing the list looking for the
1524: * insertion point.
1525: */
1526: adphead = &inodedep->id_newinoupdt;
1527: oldadp = TAILQ_LAST(adphead, allocdirectlst);
1528: if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1529: /* insert at end of list */
1530: TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1531: if (oldadp != NULL && oldadp->ad_lbn == lbn)
1532: allocdirect_merge(adphead, adp, oldadp);
1533: FREE_LOCK(&lk);
1534: return;
1535: }
1536: TAILQ_FOREACH(oldadp, adphead, ad_next) {
1537: if (oldadp->ad_lbn >= lbn)
1538: break;
1539: }
1540: if (oldadp == NULL) {
1541: FREE_LOCK(&lk);
1542: panic("softdep_setup_allocdirect: lost entry");
1543: }
1544: /* insert in middle of list */
1545: TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1546: if (oldadp->ad_lbn == lbn)
1547: allocdirect_merge(adphead, adp, oldadp);
1548: FREE_LOCK(&lk);
1549: }
1550:
1551: /*
1552: * Replace an old allocdirect dependency with a newer one.
1553: * This routine must be called with splbio interrupts blocked.
1554: */
1555: STATIC void
1556: allocdirect_merge(adphead, newadp, oldadp)
1557: struct allocdirectlst *adphead; /* head of list holding allocdirects */
1558: struct allocdirect *newadp; /* allocdirect being added */
1559: struct allocdirect *oldadp; /* existing allocdirect being checked */
1560: {
1561: struct worklist *wk;
1562: struct freefrag *freefrag;
1563: struct newdirblk *newdirblk;
1564:
1565: splassert(IPL_BIO);
1566:
1567: #ifdef DEBUG
1568: if (lk.lkt_held == -1)
1569: panic("allocdirect_merge: lock not held");
1570: #endif
1571: if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1572: newadp->ad_oldsize != oldadp->ad_newsize ||
1573: newadp->ad_lbn >= NDADDR) {
1574: FREE_LOCK(&lk);
1575: panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1576: newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1577: NDADDR);
1578: }
1579: newadp->ad_oldblkno = oldadp->ad_oldblkno;
1580: newadp->ad_oldsize = oldadp->ad_oldsize;
1581: /*
1582: * If the old dependency had a fragment to free or had never
1583: * previously had a block allocated, then the new dependency
1584: * can immediately post its freefrag and adopt the old freefrag.
1585: * This action is done by swapping the freefrag dependencies.
1586: * The new dependency gains the old one's freefrag, and the
1587: * old one gets the new one and then immediately puts it on
1588: * the worklist when it is freed by free_allocdirect. It is
1589: * not possible to do this swap when the old dependency had a
1590: * non-zero size but no previous fragment to free. This condition
1591: * arises when the new block is an extension of the old block.
1592: * Here, the first part of the fragment allocated to the new
1593: * dependency is part of the block currently claimed on disk by
1594: * the old dependency, so cannot legitimately be freed until the
1595: * conditions for the new dependency are fulfilled.
1596: */
1597: if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1598: freefrag = newadp->ad_freefrag;
1599: newadp->ad_freefrag = oldadp->ad_freefrag;
1600: oldadp->ad_freefrag = freefrag;
1601: }
1602: /*
1603: * If we are tracking a new directory-block allocation,
1604: * move it from the old allocdirect to the new allocdirect.
1605: */
1606: if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1607: newdirblk = WK_NEWDIRBLK(wk);
1608: WORKLIST_REMOVE(&newdirblk->db_list);
1609: if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1610: panic("allocdirect_merge: extra newdirblk");
1611: WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1612: }
1613: free_allocdirect(adphead, oldadp, 0);
1614: }
1615:
1616: /*
1617: * Allocate a new freefrag structure if needed.
1618: */
1619: STATIC struct freefrag *
1620: newfreefrag(ip, blkno, size)
1621: struct inode *ip;
1622: daddr_t blkno;
1623: long size;
1624: {
1625: struct freefrag *freefrag;
1626: struct fs *fs;
1627:
1628: if (blkno == 0)
1629: return (NULL);
1630: fs = ip->i_fs;
1631: if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1632: panic("newfreefrag: frag size");
1633: freefrag = pool_get(&freefrag_pool, PR_WAITOK);
1634: freefrag->ff_list.wk_type = D_FREEFRAG;
1635: freefrag->ff_state = DIP(ip, uid) & ~ONWORKLIST; /* used below */
1636: freefrag->ff_inum = ip->i_number;
1637: freefrag->ff_mnt = ITOV(ip)->v_mount;
1638: freefrag->ff_devvp = ip->i_devvp;
1639: freefrag->ff_blkno = blkno;
1640: freefrag->ff_fragsize = size;
1641: return (freefrag);
1642: }
1643:
1644: /*
1645: * This workitem de-allocates fragments that were replaced during
1646: * file block allocation.
1647: */
1648: STATIC void
1649: handle_workitem_freefrag(freefrag)
1650: struct freefrag *freefrag;
1651: {
1652: struct inode tip;
1653: struct ufs1_dinode dtip1;
1654:
1655: tip.i_vnode = NULL;
1656: tip.i_din1 = &dtip1;
1657: tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1658: tip.i_ump = VFSTOUFS(freefrag->ff_mnt);
1659: tip.i_dev = freefrag->ff_devvp->v_rdev;
1660: tip.i_number = freefrag->ff_inum;
1661: tip.i_ffs1_uid = freefrag->ff_state & ~ONWORKLIST; /* set above */
1662: ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1663: pool_put(&freefrag_pool, freefrag);
1664: }
1665:
1666: /*
1667: * Indirect block allocation dependencies.
1668: *
1669: * The same dependencies that exist for a direct block also exist when
1670: * a new block is allocated and pointed to by an entry in a block of
1671: * indirect pointers. The undo/redo states described above are also
1672: * used here. Because an indirect block contains many pointers that
1673: * may have dependencies, a second copy of the entire in-memory indirect
1674: * block is kept. The buffer cache copy is always completely up-to-date.
1675: * The second copy, which is used only as a source for disk writes,
1676: * contains only the safe pointers (i.e., those that have no remaining
1677: * update dependencies). The second copy is freed when all pointers
1678: * are safe. The cache is not allowed to replace indirect blocks with
1679: * pending update dependencies. If a buffer containing an indirect
1680: * block with dependencies is written, these routines will mark it
1681: * dirty again. It can only be successfully written once all the
1682: * dependencies are removed. The ffs_fsync routine in conjunction with
1683: * softdep_sync_metadata work together to get all the dependencies
1684: * removed so that a file can be successfully written to disk. Three
1685: * procedures are used when setting up indirect block pointer
1686: * dependencies. The division is necessary because of the organization
1687: * of the "balloc" routine and because of the distinction between file
1688: * pages and file metadata blocks.
1689: */
1690:
1691: /*
1692: * Allocate a new allocindir structure.
1693: */
1694: STATIC struct allocindir *
1695: newallocindir(ip, ptrno, newblkno, oldblkno)
1696: struct inode *ip; /* inode for file being extended */
1697: int ptrno; /* offset of pointer in indirect block */
1698: daddr_t newblkno; /* disk block number being added */
1699: daddr_t oldblkno; /* previous block number, 0 if none */
1700: {
1701: struct allocindir *aip;
1702:
1703: aip = pool_get(&allocindir_pool, PR_WAITOK);
1704: bzero(aip,sizeof(struct allocindir));
1705: aip->ai_list.wk_type = D_ALLOCINDIR;
1706: aip->ai_state = ATTACHED;
1707: aip->ai_offset = ptrno;
1708: aip->ai_newblkno = newblkno;
1709: aip->ai_oldblkno = oldblkno;
1710: aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1711: return (aip);
1712: }
1713:
1714: /*
1715: * Called just before setting an indirect block pointer
1716: * to a newly allocated file page.
1717: */
1718: void
1719: softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1720: struct inode *ip; /* inode for file being extended */
1721: daddr64_t lbn; /* allocated block number within file */
1722: struct buf *bp; /* buffer with indirect blk referencing page */
1723: int ptrno; /* offset of pointer in indirect block */
1724: daddr_t newblkno; /* disk block number being added */
1725: daddr_t oldblkno; /* previous block number, 0 if none */
1726: struct buf *nbp; /* buffer holding allocated page */
1727: {
1728: struct allocindir *aip;
1729: struct pagedep *pagedep;
1730:
1731: aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1732: ACQUIRE_LOCK(&lk);
1733: /*
1734: * If we are allocating a directory page, then we must
1735: * allocate an associated pagedep to track additions and
1736: * deletions.
1737: */
1738: if ((DIP(ip, mode) & IFMT) == IFDIR &&
1739: pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1740: WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1741: if (nbp == NULL) {
1742: /*
1743: * XXXUBC - Yes, I know how to fix this, but not right now.
1744: */
1745: panic("softdep_setup_allocindir_page: Bonk art in the head");
1746: }
1747: WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1748: FREE_LOCK(&lk);
1749: setup_allocindir_phase2(bp, ip, aip);
1750: }
1751:
1752: /*
1753: * Called just before setting an indirect block pointer to a
1754: * newly allocated indirect block.
1755: */
1756: void
1757: softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1758: struct buf *nbp; /* newly allocated indirect block */
1759: struct inode *ip; /* inode for file being extended */
1760: struct buf *bp; /* indirect block referencing allocated block */
1761: int ptrno; /* offset of pointer in indirect block */
1762: daddr_t newblkno; /* disk block number being added */
1763: {
1764: struct allocindir *aip;
1765:
1766: aip = newallocindir(ip, ptrno, newblkno, 0);
1767: ACQUIRE_LOCK(&lk);
1768: WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1769: FREE_LOCK(&lk);
1770: setup_allocindir_phase2(bp, ip, aip);
1771: }
1772:
1773: /*
1774: * Called to finish the allocation of the "aip" allocated
1775: * by one of the two routines above.
1776: */
1777: STATIC void
1778: setup_allocindir_phase2(bp, ip, aip)
1779: struct buf *bp; /* in-memory copy of the indirect block */
1780: struct inode *ip; /* inode for file being extended */
1781: struct allocindir *aip; /* allocindir allocated by the above routines */
1782: {
1783: struct worklist *wk;
1784: struct indirdep *indirdep, *newindirdep;
1785: struct bmsafemap *bmsafemap;
1786: struct allocindir *oldaip;
1787: struct freefrag *freefrag;
1788: struct newblk *newblk;
1789:
1790: if (bp->b_lblkno >= 0)
1791: panic("setup_allocindir_phase2: not indir blk");
1792: for (indirdep = NULL, newindirdep = NULL; ; ) {
1793: ACQUIRE_LOCK(&lk);
1794: LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1795: if (wk->wk_type != D_INDIRDEP)
1796: continue;
1797: indirdep = WK_INDIRDEP(wk);
1798: break;
1799: }
1800: if (indirdep == NULL && newindirdep) {
1801: indirdep = newindirdep;
1802: WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1803: newindirdep = NULL;
1804: }
1805: FREE_LOCK(&lk);
1806: if (indirdep) {
1807: if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1808: &newblk) == 0)
1809: panic("setup_allocindir: lost block");
1810: ACQUIRE_LOCK(&lk);
1811: if (newblk->nb_state == DEPCOMPLETE) {
1812: aip->ai_state |= DEPCOMPLETE;
1813: aip->ai_buf = NULL;
1814: } else {
1815: bmsafemap = newblk->nb_bmsafemap;
1816: aip->ai_buf = bmsafemap->sm_buf;
1817: LIST_REMOVE(newblk, nb_deps);
1818: LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1819: aip, ai_deps);
1820: }
1821: LIST_REMOVE(newblk, nb_hash);
1822: pool_put(&newblk_pool, newblk);
1823: aip->ai_indirdep = indirdep;
1824: /*
1825: * Check to see if there is an existing dependency
1826: * for this block. If there is, merge the old
1827: * dependency into the new one.
1828: */
1829: if (aip->ai_oldblkno == 0)
1830: oldaip = NULL;
1831: else
1832:
1833: LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1834: if (oldaip->ai_offset == aip->ai_offset)
1835: break;
1836: freefrag = NULL;
1837: if (oldaip != NULL) {
1838: if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1839: FREE_LOCK(&lk);
1840: panic("setup_allocindir_phase2: blkno");
1841: }
1842: aip->ai_oldblkno = oldaip->ai_oldblkno;
1843: freefrag = aip->ai_freefrag;
1844: aip->ai_freefrag = oldaip->ai_freefrag;
1845: oldaip->ai_freefrag = NULL;
1846: free_allocindir(oldaip, NULL);
1847: }
1848: LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1849: if (ip->i_ump->um_fstype == UM_UFS1)
1850: ((int32_t *)indirdep->ir_savebp->b_data)
1851: [aip->ai_offset] = aip->ai_oldblkno;
1852: else
1853: ((int64_t *)indirdep->ir_savebp->b_data)
1854: [aip->ai_offset] = aip->ai_oldblkno;
1855: FREE_LOCK(&lk);
1856: if (freefrag != NULL)
1857: handle_workitem_freefrag(freefrag);
1858: }
1859: if (newindirdep) {
1860: if (indirdep->ir_savebp != NULL)
1861: brelse(newindirdep->ir_savebp);
1862: WORKITEM_FREE(newindirdep, D_INDIRDEP);
1863: }
1864: if (indirdep)
1865: break;
1866: newindirdep = pool_get(&indirdep_pool, PR_WAITOK);
1867: newindirdep->ir_list.wk_type = D_INDIRDEP;
1868: newindirdep->ir_state = ATTACHED;
1869: if (ip->i_ump->um_fstype == UM_UFS1)
1870: newindirdep->ir_state |= UFS1FMT;
1871: LIST_INIT(&newindirdep->ir_deplisthd);
1872: LIST_INIT(&newindirdep->ir_donehd);
1873: if (bp->b_blkno == bp->b_lblkno) {
1874: VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1875: NULL);
1876: }
1877: newindirdep->ir_savebp =
1878: getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1879: #if 0
1880: BUF_KERNPROC(newindirdep->ir_savebp);
1881: #endif
1882: bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1883: }
1884: }
1885:
1886: /*
1887: * Block de-allocation dependencies.
1888: *
1889: * When blocks are de-allocated, the on-disk pointers must be nullified before
1890: * the blocks are made available for use by other files. (The true
1891: * requirement is that old pointers must be nullified before new on-disk
1892: * pointers are set. We chose this slightly more stringent requirement to
1893: * reduce complexity.) Our implementation handles this dependency by updating
1894: * the inode (or indirect block) appropriately but delaying the actual block
1895: * de-allocation (i.e., freemap and free space count manipulation) until
1896: * after the updated versions reach stable storage. After the disk is
1897: * updated, the blocks can be safely de-allocated whenever it is convenient.
1898: * This implementation handles only the common case of reducing a file's
1899: * length to zero. Other cases are handled by the conventional synchronous
1900: * write approach.
1901: *
1902: * The ffs implementation with which we worked double-checks
1903: * the state of the block pointers and file size as it reduces
1904: * a file's length. Some of this code is replicated here in our
1905: * soft updates implementation. The freeblks->fb_chkcnt field is
1906: * used to transfer a part of this information to the procedure
1907: * that eventually de-allocates the blocks.
1908: *
1909: * This routine should be called from the routine that shortens
1910: * a file's length, before the inode's size or block pointers
1911: * are modified. It will save the block pointer information for
1912: * later release and zero the inode so that the calling routine
1913: * can release it.
1914: */
1915: void
1916: softdep_setup_freeblocks(ip, length)
1917: struct inode *ip; /* The inode whose length is to be reduced */
1918: off_t length; /* The new length for the file */
1919: {
1920: struct freeblks *freeblks;
1921: struct inodedep *inodedep;
1922: struct allocdirect *adp;
1923: struct vnode *vp;
1924: struct buf *bp;
1925: struct fs *fs;
1926: int i, delay, error;
1927:
1928: fs = ip->i_fs;
1929: if (length != 0)
1930: panic("softdep_setup_freeblocks: non-zero length");
1931: freeblks = pool_get(&freeblks_pool, PR_WAITOK);
1932: bzero(freeblks, sizeof(struct freeblks));
1933: freeblks->fb_list.wk_type = D_FREEBLKS;
1934: freeblks->fb_state = ATTACHED;
1935: freeblks->fb_uid = DIP(ip, uid);
1936: freeblks->fb_previousinum = ip->i_number;
1937: freeblks->fb_devvp = ip->i_devvp;
1938: freeblks->fb_mnt = ITOV(ip)->v_mount;
1939: freeblks->fb_oldsize = DIP(ip, size);
1940: freeblks->fb_newsize = length;
1941: freeblks->fb_chkcnt = DIP(ip, blocks);
1942:
1943: for (i = 0; i < NDADDR; i++) {
1944: freeblks->fb_dblks[i] = DIP(ip, db[i]);
1945: DIP_ASSIGN(ip, db[i], 0);
1946: }
1947:
1948: for (i = 0; i < NIADDR; i++) {
1949: freeblks->fb_iblks[i] = DIP(ip, ib[i]);
1950: DIP_ASSIGN(ip, ib[i], 0);
1951: }
1952:
1953: DIP_ASSIGN(ip, blocks, 0);
1954: DIP_ASSIGN(ip, size, 0);
1955:
1956: /*
1957: * Push the zero'ed inode to to its disk buffer so that we are free
1958: * to delete its dependencies below. Once the dependencies are gone
1959: * the buffer can be safely released.
1960: */
1961: if ((error = bread(ip->i_devvp,
1962: fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1963: (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1964: softdep_error("softdep_setup_freeblocks", error);
1965:
1966: if (ip->i_ump->um_fstype == UM_UFS1)
1967: *((struct ufs1_dinode *) bp->b_data +
1968: ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
1969: else
1970: *((struct ufs2_dinode *) bp->b_data +
1971: ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
1972:
1973: /*
1974: * Find and eliminate any inode dependencies.
1975: */
1976: ACQUIRE_LOCK(&lk);
1977: (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1978: if ((inodedep->id_state & IOSTARTED) != 0) {
1979: FREE_LOCK(&lk);
1980: panic("softdep_setup_freeblocks: inode busy");
1981: }
1982: /*
1983: * Add the freeblks structure to the list of operations that
1984: * must await the zero'ed inode being written to disk. If we
1985: * still have a bitmap dependency (delay == 0), then the inode
1986: * has never been written to disk, so we can process the
1987: * freeblks below once we have deleted the dependencies.
1988: */
1989: delay = (inodedep->id_state & DEPCOMPLETE);
1990: if (delay)
1991: WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1992: /*
1993: * Because the file length has been truncated to zero, any
1994: * pending block allocation dependency structures associated
1995: * with this inode are obsolete and can simply be de-allocated.
1996: * We must first merge the two dependency lists to get rid of
1997: * any duplicate freefrag structures, then purge the merged list.
1998: * If we still have a bitmap dependency, then the inode has never
1999: * been written to disk, so we can free any fragments without delay.
2000: */
2001: merge_inode_lists(inodedep);
2002: while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2003: free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2004: FREE_LOCK(&lk);
2005: bdwrite(bp);
2006: /*
2007: * We must wait for any I/O in progress to finish so that
2008: * all potential buffers on the dirty list will be visible.
2009: * Once they are all there, walk the list and get rid of
2010: * any dependencies.
2011: */
2012: vp = ITOV(ip);
2013: ACQUIRE_LOCK(&lk);
2014: drain_output(vp, 1);
2015: while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2016: if (!getdirtybuf(bp, MNT_WAIT))
2017: break;
2018: (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
2019: deallocate_dependencies(bp, inodedep);
2020: bp->b_flags |= B_INVAL | B_NOCACHE;
2021: FREE_LOCK(&lk);
2022: brelse(bp);
2023: ACQUIRE_LOCK(&lk);
2024: }
2025: if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2026: (void) free_inodedep(inodedep);
2027:
2028: if (delay) {
2029: freeblks->fb_state |= DEPCOMPLETE;
2030: /*
2031: * If the inode with zeroed block pointers is now on disk we
2032: * can start freeing blocks. Add freeblks to the worklist
2033: * instead of calling handle_workitem_freeblocks() directly as
2034: * it is more likely that additional IO is needed to complete
2035: * the request than in the !delay case.
2036: */
2037: if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2038: add_to_worklist(&freeblks->fb_list);
2039: }
2040:
2041: FREE_LOCK(&lk);
2042: /*
2043: * If the inode has never been written to disk (delay == 0),
2044: * then we can process the freeblks now that we have deleted
2045: * the dependencies.
2046: */
2047: if (!delay)
2048: handle_workitem_freeblocks(freeblks);
2049: }
2050:
2051: /*
2052: * Reclaim any dependency structures from a buffer that is about to
2053: * be reallocated to a new vnode. The buffer must be locked, thus,
2054: * no I/O completion operations can occur while we are manipulating
2055: * its associated dependencies. The mutex is held so that other I/O's
2056: * associated with related dependencies do not occur.
2057: */
2058: STATIC void
2059: deallocate_dependencies(bp, inodedep)
2060: struct buf *bp;
2061: struct inodedep *inodedep;
2062: {
2063: struct worklist *wk;
2064: struct indirdep *indirdep;
2065: struct allocindir *aip;
2066: struct pagedep *pagedep;
2067: struct dirrem *dirrem;
2068: struct diradd *dap;
2069: int i;
2070:
2071: while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2072: switch (wk->wk_type) {
2073:
2074: case D_INDIRDEP:
2075: indirdep = WK_INDIRDEP(wk);
2076: /*
2077: * None of the indirect pointers will ever be visible,
2078: * so they can simply be tossed. GOINGAWAY ensures
2079: * that allocated pointers will be saved in the buffer
2080: * cache until they are freed. Note that they will
2081: * only be able to be found by their physical address
2082: * since the inode mapping the logical address will
2083: * be gone. The save buffer used for the safe copy
2084: * was allocated in setup_allocindir_phase2 using
2085: * the physical address so it could be used for this
2086: * purpose. Hence we swap the safe copy with the real
2087: * copy, allowing the safe copy to be freed and holding
2088: * on to the real copy for later use in indir_trunc.
2089: */
2090: if (indirdep->ir_state & GOINGAWAY) {
2091: FREE_LOCK(&lk);
2092: panic("deallocate_dependencies: already gone");
2093: }
2094: indirdep->ir_state |= GOINGAWAY;
2095: while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2096: free_allocindir(aip, inodedep);
2097: if (bp->b_lblkno >= 0 ||
2098: bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
2099: FREE_LOCK(&lk);
2100: panic("deallocate_dependencies: not indir");
2101: }
2102: bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2103: bp->b_bcount);
2104: WORKLIST_REMOVE(wk);
2105: WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2106: continue;
2107:
2108: case D_PAGEDEP:
2109: pagedep = WK_PAGEDEP(wk);
2110: /*
2111: * None of the directory additions will ever be
2112: * visible, so they can simply be tossed.
2113: */
2114: for (i = 0; i < DAHASHSZ; i++)
2115: while ((dap =
2116: LIST_FIRST(&pagedep->pd_diraddhd[i])))
2117: free_diradd(dap);
2118: while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2119: free_diradd(dap);
2120: /*
2121: * Copy any directory remove dependencies to the list
2122: * to be processed after the zero'ed inode is written.
2123: * If the inode has already been written, then they
2124: * can be dumped directly onto the work list.
2125: */
2126: while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
2127: LIST_REMOVE(dirrem, dm_next);
2128: dirrem->dm_dirinum = pagedep->pd_ino;
2129: if (inodedep == NULL ||
2130: (inodedep->id_state & ALLCOMPLETE) ==
2131: ALLCOMPLETE)
2132: add_to_worklist(&dirrem->dm_list);
2133: else
2134: WORKLIST_INSERT(&inodedep->id_bufwait,
2135: &dirrem->dm_list);
2136: }
2137: if ((pagedep->pd_state & NEWBLOCK) != 0) {
2138: LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2139: if (wk->wk_type == D_NEWDIRBLK &&
2140: WK_NEWDIRBLK(wk)->db_pagedep ==
2141: pagedep)
2142: break;
2143: if (wk != NULL) {
2144: WORKLIST_REMOVE(wk);
2145: free_newdirblk(WK_NEWDIRBLK(wk));
2146: } else {
2147: FREE_LOCK(&lk);
2148: panic("deallocate_dependencies: "
2149: "lost pagedep");
2150: }
2151: }
2152: WORKLIST_REMOVE(&pagedep->pd_list);
2153: LIST_REMOVE(pagedep, pd_hash);
2154: WORKITEM_FREE(pagedep, D_PAGEDEP);
2155: continue;
2156:
2157: case D_ALLOCINDIR:
2158: free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2159: continue;
2160:
2161: case D_ALLOCDIRECT:
2162: case D_INODEDEP:
2163: FREE_LOCK(&lk);
2164: panic("deallocate_dependencies: Unexpected type %s",
2165: TYPENAME(wk->wk_type));
2166: /* NOTREACHED */
2167:
2168: default:
2169: FREE_LOCK(&lk);
2170: panic("deallocate_dependencies: Unknown type %s",
2171: TYPENAME(wk->wk_type));
2172: /* NOTREACHED */
2173: }
2174: }
2175: }
2176:
2177: /*
2178: * Free an allocdirect. Generate a new freefrag work request if appropriate.
2179: * This routine must be called with splbio interrupts blocked.
2180: */
2181: STATIC void
2182: free_allocdirect(adphead, adp, delay)
2183: struct allocdirectlst *adphead;
2184: struct allocdirect *adp;
2185: int delay;
2186: {
2187: struct newdirblk *newdirblk;
2188: struct worklist *wk;
2189:
2190: splassert(IPL_BIO);
2191:
2192: #ifdef DEBUG
2193: if (lk.lkt_held == -1)
2194: panic("free_allocdirect: lock not held");
2195: #endif
2196: if ((adp->ad_state & DEPCOMPLETE) == 0)
2197: LIST_REMOVE(adp, ad_deps);
2198: TAILQ_REMOVE(adphead, adp, ad_next);
2199: if ((adp->ad_state & COMPLETE) == 0)
2200: WORKLIST_REMOVE(&adp->ad_list);
2201: if (adp->ad_freefrag != NULL) {
2202: if (delay)
2203: WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2204: &adp->ad_freefrag->ff_list);
2205: else
2206: add_to_worklist(&adp->ad_freefrag->ff_list);
2207: }
2208: if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2209: newdirblk = WK_NEWDIRBLK(wk);
2210: WORKLIST_REMOVE(&newdirblk->db_list);
2211: if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2212: panic("free_allocdirect: extra newdirblk");
2213: if (delay)
2214: WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2215: &newdirblk->db_list);
2216: else
2217: free_newdirblk(newdirblk);
2218: }
2219: WORKITEM_FREE(adp, D_ALLOCDIRECT);
2220: }
2221:
2222: /*
2223: * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2224: * This routine must be called with splbio interrupts blocked.
2225: */
2226: void
2227: free_newdirblk(newdirblk)
2228: struct newdirblk *newdirblk;
2229: {
2230: struct pagedep *pagedep;
2231: struct diradd *dap;
2232: int i;
2233:
2234: splassert(IPL_BIO);
2235:
2236: #ifdef DEBUG
2237: if (lk.lkt_held == -1)
2238: panic("free_newdirblk: lock not held");
2239: #endif
2240: /*
2241: * If the pagedep is still linked onto the directory buffer
2242: * dependency chain, then some of the entries on the
2243: * pd_pendinghd list may not be committed to disk yet. In
2244: * this case, we will simply clear the NEWBLOCK flag and
2245: * let the pd_pendinghd list be processed when the pagedep
2246: * is next written. If the pagedep is no longer on the buffer
2247: * dependency chain, then all the entries on the pd_pending
2248: * list are committed to disk and we can free them here.
2249: */
2250: pagedep = newdirblk->db_pagedep;
2251: pagedep->pd_state &= ~NEWBLOCK;
2252: if ((pagedep->pd_state & ONWORKLIST) == 0)
2253: while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2254: free_diradd(dap);
2255: /*
2256: * If no dependencies remain, the pagedep will be freed.
2257: */
2258: for (i = 0; i < DAHASHSZ; i++)
2259: if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2260: break;
2261: if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2262: LIST_REMOVE(pagedep, pd_hash);
2263: WORKITEM_FREE(pagedep, D_PAGEDEP);
2264: }
2265: WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2266: }
2267:
2268: /*
2269: * Prepare an inode to be freed. The actual free operation is not
2270: * done until the zero'ed inode has been written to disk.
2271: */
2272: void
2273: softdep_freefile(pvp, ino, mode)
2274: struct vnode *pvp;
2275: ino_t ino;
2276: mode_t mode;
2277: {
2278: struct inode *ip = VTOI(pvp);
2279: struct inodedep *inodedep;
2280: struct freefile *freefile;
2281:
2282: /*
2283: * This sets up the inode de-allocation dependency.
2284: */
2285: freefile = pool_get(&freefile_pool, PR_WAITOK);
2286: freefile->fx_list.wk_type = D_FREEFILE;
2287: freefile->fx_list.wk_state = 0;
2288: freefile->fx_mode = mode;
2289: freefile->fx_oldinum = ino;
2290: freefile->fx_devvp = ip->i_devvp;
2291: freefile->fx_mnt = ITOV(ip)->v_mount;
2292:
2293: /*
2294: * If the inodedep does not exist, then the zero'ed inode has
2295: * been written to disk. If the allocated inode has never been
2296: * written to disk, then the on-disk inode is zero'ed. In either
2297: * case we can free the file immediately.
2298: */
2299: ACQUIRE_LOCK(&lk);
2300: if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2301: check_inode_unwritten(inodedep)) {
2302: FREE_LOCK(&lk);
2303: handle_workitem_freefile(freefile);
2304: return;
2305: }
2306: WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2307: FREE_LOCK(&lk);
2308: }
2309:
2310: /*
2311: * Check to see if an inode has never been written to disk. If
2312: * so free the inodedep and return success, otherwise return failure.
2313: * This routine must be called with splbio interrupts blocked.
2314: *
2315: * If we still have a bitmap dependency, then the inode has never
2316: * been written to disk. Drop the dependency as it is no longer
2317: * necessary since the inode is being deallocated. We set the
2318: * ALLCOMPLETE flags since the bitmap now properly shows that the
2319: * inode is not allocated. Even if the inode is actively being
2320: * written, it has been rolled back to its zero'ed state, so we
2321: * are ensured that a zero inode is what is on the disk. For short
2322: * lived files, this change will usually result in removing all the
2323: * dependencies from the inode so that it can be freed immediately.
2324: */
2325: STATIC int
2326: check_inode_unwritten(inodedep)
2327: struct inodedep *inodedep;
2328: {
2329: splassert(IPL_BIO);
2330:
2331: if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2332: LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2333: LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2334: LIST_FIRST(&inodedep->id_inowait) != NULL ||
2335: TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2336: TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2337: inodedep->id_nlinkdelta != 0)
2338: return (0);
2339: inodedep->id_state |= ALLCOMPLETE;
2340: LIST_REMOVE(inodedep, id_deps);
2341: inodedep->id_buf = NULL;
2342: if (inodedep->id_state & ONWORKLIST)
2343: WORKLIST_REMOVE(&inodedep->id_list);
2344: if (inodedep->id_savedino1 != NULL) {
2345: FREE(inodedep->id_savedino1, M_INODEDEP);
2346: inodedep->id_savedino1 = NULL;
2347: }
2348: if (free_inodedep(inodedep) == 0) {
2349: FREE_LOCK(&lk);
2350: panic("check_inode_unwritten: busy inode");
2351: }
2352: return (1);
2353: }
2354:
2355: /*
2356: * Try to free an inodedep structure. Return 1 if it could be freed.
2357: */
2358: STATIC int
2359: free_inodedep(inodedep)
2360: struct inodedep *inodedep;
2361: {
2362:
2363: if ((inodedep->id_state & ONWORKLIST) != 0 ||
2364: (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2365: LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2366: LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2367: LIST_FIRST(&inodedep->id_inowait) != NULL ||
2368: TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2369: TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2370: inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2371: return (0);
2372: LIST_REMOVE(inodedep, id_hash);
2373: WORKITEM_FREE(inodedep, D_INODEDEP);
2374: num_inodedep -= 1;
2375: return (1);
2376: }
2377:
2378: /*
2379: * This workitem routine performs the block de-allocation.
2380: * The workitem is added to the pending list after the updated
2381: * inode block has been written to disk. As mentioned above,
2382: * checks regarding the number of blocks de-allocated (compared
2383: * to the number of blocks allocated for the file) are also
2384: * performed in this function.
2385: */
2386: STATIC void
2387: handle_workitem_freeblocks(freeblks)
2388: struct freeblks *freeblks;
2389: {
2390: struct inode tip;
2391: daddr_t bn;
2392: union {
2393: struct ufs1_dinode di1;
2394: struct ufs2_dinode di2;
2395: } di;
2396: struct fs *fs;
2397: int i, level, bsize;
2398: long nblocks, blocksreleased = 0;
2399: int error, allerror = 0;
2400: daddr64_t baselbns[NIADDR], tmpval;
2401:
2402: if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UM_UFS1)
2403: tip.i_din1 = &di.di1;
2404: else
2405: tip.i_din2 = &di.di2;
2406:
2407: tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2408: tip.i_number = freeblks->fb_previousinum;
2409: tip.i_ump = VFSTOUFS(freeblks->fb_mnt);
2410: tip.i_dev = freeblks->fb_devvp->v_rdev;
2411: DIP_ASSIGN(&tip, size, freeblks->fb_oldsize);
2412: DIP_ASSIGN(&tip, uid, freeblks->fb_uid);
2413: tip.i_vnode = NULL;
2414: tmpval = 1;
2415: baselbns[0] = NDADDR;
2416: for (i = 1; i < NIADDR; i++) {
2417: tmpval *= NINDIR(fs);
2418: baselbns[i] = baselbns[i - 1] + tmpval;
2419: }
2420: nblocks = btodb(fs->fs_bsize);
2421: blocksreleased = 0;
2422: /*
2423: * Indirect blocks first.
2424: */
2425: for (level = (NIADDR - 1); level >= 0; level--) {
2426: if ((bn = freeblks->fb_iblks[level]) == 0)
2427: continue;
2428: if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2429: baselbns[level], &blocksreleased)) != 0)
2430: allerror = error;
2431: ffs_blkfree(&tip, bn, fs->fs_bsize);
2432: blocksreleased += nblocks;
2433: }
2434: /*
2435: * All direct blocks or frags.
2436: */
2437: for (i = (NDADDR - 1); i >= 0; i--) {
2438: if ((bn = freeblks->fb_dblks[i]) == 0)
2439: continue;
2440: bsize = blksize(fs, &tip, i);
2441: ffs_blkfree(&tip, bn, bsize);
2442: blocksreleased += btodb(bsize);
2443: }
2444:
2445: #ifdef DIAGNOSTIC
2446: if (freeblks->fb_chkcnt != blocksreleased)
2447: printf("handle_workitem_freeblocks: block count\n");
2448: if (allerror)
2449: softdep_error("handle_workitem_freeblks", allerror);
2450: #endif /* DIAGNOSTIC */
2451: WORKITEM_FREE(freeblks, D_FREEBLKS);
2452: }
2453:
2454: /*
2455: * Release blocks associated with the inode ip and stored in the indirect
2456: * block dbn. If level is greater than SINGLE, the block is an indirect block
2457: * and recursive calls to indirtrunc must be used to cleanse other indirect
2458: * blocks.
2459: */
2460: STATIC int
2461: indir_trunc(ip, dbn, level, lbn, countp)
2462: struct inode *ip;
2463: daddr_t dbn;
2464: int level;
2465: daddr64_t lbn;
2466: long *countp;
2467: {
2468: struct buf *bp;
2469: int32_t *bap1 = NULL;
2470: int64_t nb, *bap2 = NULL;
2471: struct fs *fs;
2472: struct worklist *wk;
2473: struct indirdep *indirdep;
2474: int i, lbnadd, nblocks, ufs1fmt;
2475: int error, allerror = 0;
2476:
2477: fs = ip->i_fs;
2478: lbnadd = 1;
2479: for (i = level; i > 0; i--)
2480: lbnadd *= NINDIR(fs);
2481: /*
2482: * Get buffer of block pointers to be freed. This routine is not
2483: * called until the zero'ed inode has been written, so it is safe
2484: * to free blocks as they are encountered. Because the inode has
2485: * been zero'ed, calls to bmap on these blocks will fail. So, we
2486: * have to use the on-disk address and the block device for the
2487: * filesystem to look them up. If the file was deleted before its
2488: * indirect blocks were all written to disk, the routine that set
2489: * us up (deallocate_dependencies) will have arranged to leave
2490: * a complete copy of the indirect block in memory for our use.
2491: * Otherwise we have to read the blocks in from the disk.
2492: */
2493: ACQUIRE_LOCK(&lk);
2494: if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2495: (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2496: if (wk->wk_type != D_INDIRDEP ||
2497: (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2498: (indirdep->ir_state & GOINGAWAY) == 0) {
2499: FREE_LOCK(&lk);
2500: panic("indir_trunc: lost indirdep");
2501: }
2502: WORKLIST_REMOVE(wk);
2503: WORKITEM_FREE(indirdep, D_INDIRDEP);
2504: if (LIST_FIRST(&bp->b_dep) != NULL) {
2505: FREE_LOCK(&lk);
2506: panic("indir_trunc: dangling dep");
2507: }
2508: FREE_LOCK(&lk);
2509: } else {
2510: FREE_LOCK(&lk);
2511: error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2512: if (error)
2513: return (error);
2514: }
2515: /*
2516: * Recursively free indirect blocks.
2517: */
2518: if (ip->i_ump->um_fstype == UM_UFS1) {
2519: ufs1fmt = 1;
2520: bap1 = (int32_t *)bp->b_data;
2521: } else {
2522: ufs1fmt = 0;
2523: bap2 = (int64_t *)bp->b_data;
2524: }
2525: nblocks = btodb(fs->fs_bsize);
2526: for (i = NINDIR(fs) - 1; i >= 0; i--) {
2527: if (ufs1fmt)
2528: nb = bap1[i];
2529: else
2530: nb = bap2[i];
2531: if (nb == 0)
2532: continue;
2533: if (level != 0) {
2534: if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2535: level - 1, lbn + (i * lbnadd), countp)) != 0)
2536: allerror = error;
2537: }
2538: ffs_blkfree(ip, nb, fs->fs_bsize);
2539: *countp += nblocks;
2540: }
2541: bp->b_flags |= B_INVAL | B_NOCACHE;
2542: brelse(bp);
2543: return (allerror);
2544: }
2545:
2546: /*
2547: * Free an allocindir.
2548: * This routine must be called with splbio interrupts blocked.
2549: */
2550: STATIC void
2551: free_allocindir(aip, inodedep)
2552: struct allocindir *aip;
2553: struct inodedep *inodedep;
2554: {
2555: struct freefrag *freefrag;
2556:
2557: splassert(IPL_BIO);
2558:
2559: #ifdef DEBUG
2560: if (lk.lkt_held == -1)
2561: panic("free_allocindir: lock not held");
2562: #endif
2563: if ((aip->ai_state & DEPCOMPLETE) == 0)
2564: LIST_REMOVE(aip, ai_deps);
2565: if (aip->ai_state & ONWORKLIST)
2566: WORKLIST_REMOVE(&aip->ai_list);
2567: LIST_REMOVE(aip, ai_next);
2568: if ((freefrag = aip->ai_freefrag) != NULL) {
2569: if (inodedep == NULL)
2570: add_to_worklist(&freefrag->ff_list);
2571: else
2572: WORKLIST_INSERT(&inodedep->id_bufwait,
2573: &freefrag->ff_list);
2574: }
2575: WORKITEM_FREE(aip, D_ALLOCINDIR);
2576: }
2577:
2578: /*
2579: * Directory entry addition dependencies.
2580: *
2581: * When adding a new directory entry, the inode (with its incremented link
2582: * count) must be written to disk before the directory entry's pointer to it.
2583: * Also, if the inode is newly allocated, the corresponding freemap must be
2584: * updated (on disk) before the directory entry's pointer. These requirements
2585: * are met via undo/redo on the directory entry's pointer, which consists
2586: * simply of the inode number.
2587: *
2588: * As directory entries are added and deleted, the free space within a
2589: * directory block can become fragmented. The ufs file system will compact
2590: * a fragmented directory block to make space for a new entry. When this
2591: * occurs, the offsets of previously added entries change. Any "diradd"
2592: * dependency structures corresponding to these entries must be updated with
2593: * the new offsets.
2594: */
2595:
2596: /*
2597: * This routine is called after the in-memory inode's link
2598: * count has been incremented, but before the directory entry's
2599: * pointer to the inode has been set.
2600: */
2601: int
2602: softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2603: struct buf *bp; /* buffer containing directory block */
2604: struct inode *dp; /* inode for directory */
2605: off_t diroffset; /* offset of new entry in directory */
2606: long newinum; /* inode referenced by new directory entry */
2607: struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2608: int isnewblk; /* entry is in a newly allocated block */
2609: {
2610: int offset; /* offset of new entry within directory block */
2611: daddr64_t lbn; /* block in directory containing new entry */
2612: struct fs *fs;
2613: struct diradd *dap;
2614: struct allocdirect *adp;
2615: struct pagedep *pagedep;
2616: struct inodedep *inodedep;
2617: struct newdirblk *newdirblk = NULL;
2618: struct mkdir *mkdir1, *mkdir2;
2619:
2620:
2621: fs = dp->i_fs;
2622: lbn = lblkno(fs, diroffset);
2623: offset = blkoff(fs, diroffset);
2624: dap = pool_get(&diradd_pool, PR_WAITOK);
2625: bzero(dap,sizeof(struct diradd));
2626: dap->da_list.wk_type = D_DIRADD;
2627: dap->da_offset = offset;
2628: dap->da_newinum = newinum;
2629: dap->da_state = ATTACHED;
2630: if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2631: newdirblk = pool_get(&newdirblk_pool, PR_WAITOK);
2632: newdirblk->db_list.wk_type = D_NEWDIRBLK;
2633: newdirblk->db_state = 0;
2634: }
2635: if (newdirbp == NULL) {
2636: dap->da_state |= DEPCOMPLETE;
2637: ACQUIRE_LOCK(&lk);
2638: } else {
2639: dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2640: mkdir1 = pool_get(&mkdir_pool, PR_WAITOK);
2641: mkdir1->md_list.wk_type = D_MKDIR;
2642: mkdir1->md_state = MKDIR_BODY;
2643: mkdir1->md_diradd = dap;
2644: mkdir2 = pool_get(&mkdir_pool, PR_WAITOK);
2645: mkdir2->md_list.wk_type = D_MKDIR;
2646: mkdir2->md_state = MKDIR_PARENT;
2647: mkdir2->md_diradd = dap;
2648: /*
2649: * Dependency on "." and ".." being written to disk.
2650: */
2651: mkdir1->md_buf = newdirbp;
2652: ACQUIRE_LOCK(&lk);
2653: LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2654: WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2655: FREE_LOCK(&lk);
2656: bdwrite(newdirbp);
2657: /*
2658: * Dependency on link count increase for parent directory
2659: */
2660: ACQUIRE_LOCK(&lk);
2661: if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2662: || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2663: dap->da_state &= ~MKDIR_PARENT;
2664: WORKITEM_FREE(mkdir2, D_MKDIR);
2665: } else {
2666: LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2667: WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2668: }
2669: }
2670: /*
2671: * Link into parent directory pagedep to await its being written.
2672: */
2673: if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2674: WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2675: dap->da_pagedep = pagedep;
2676: LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2677: da_pdlist);
2678: /*
2679: * Link into its inodedep. Put it on the id_bufwait list if the inode
2680: * is not yet written. If it is written, do the post-inode write
2681: * processing to put it on the id_pendinghd list.
2682: */
2683: (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2684: if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2685: diradd_inode_written(dap, inodedep);
2686: else
2687: WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2688: if (isnewblk) {
2689: /*
2690: * Directories growing into indirect blocks are rare
2691: * enough and the frequency of new block allocation
2692: * in those cases even more rare, that we choose not
2693: * to bother tracking them. Rather we simply force the
2694: * new directory entry to disk.
2695: */
2696: if (lbn >= NDADDR) {
2697: FREE_LOCK(&lk);
2698: /*
2699: * We only have a new allocation when at the
2700: * beginning of a new block, not when we are
2701: * expanding into an existing block.
2702: */
2703: if (blkoff(fs, diroffset) == 0)
2704: return (1);
2705: return (0);
2706: }
2707: /*
2708: * We only have a new allocation when at the beginning
2709: * of a new fragment, not when we are expanding into an
2710: * existing fragment. Also, there is nothing to do if we
2711: * are already tracking this block.
2712: */
2713: if (fragoff(fs, diroffset) != 0) {
2714: FREE_LOCK(&lk);
2715: return (0);
2716: }
2717:
2718: if ((pagedep->pd_state & NEWBLOCK) != 0) {
2719: WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2720: FREE_LOCK(&lk);
2721: return (0);
2722: }
2723: /*
2724: * Find our associated allocdirect and have it track us.
2725: */
2726: if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2727: panic("softdep_setup_directory_add: lost inodedep");
2728: adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2729: if (adp == NULL || adp->ad_lbn != lbn) {
2730: FREE_LOCK(&lk);
2731: panic("softdep_setup_directory_add: lost entry");
2732: }
2733: pagedep->pd_state |= NEWBLOCK;
2734: newdirblk->db_pagedep = pagedep;
2735: WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2736: }
2737: FREE_LOCK(&lk);
2738: return (0);
2739: }
2740:
2741: /*
2742: * This procedure is called to change the offset of a directory
2743: * entry when compacting a directory block which must be owned
2744: * exclusively by the caller. Note that the actual entry movement
2745: * must be done in this procedure to ensure that no I/O completions
2746: * occur while the move is in progress.
2747: */
2748: void
2749: softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2750: struct inode *dp; /* inode for directory */
2751: caddr_t base; /* address of dp->i_offset */
2752: caddr_t oldloc; /* address of old directory location */
2753: caddr_t newloc; /* address of new directory location */
2754: int entrysize; /* size of directory entry */
2755: {
2756: int offset, oldoffset, newoffset;
2757: struct pagedep *pagedep;
2758: struct diradd *dap;
2759: daddr64_t lbn;
2760:
2761: ACQUIRE_LOCK(&lk);
2762: lbn = lblkno(dp->i_fs, dp->i_offset);
2763: offset = blkoff(dp->i_fs, dp->i_offset);
2764: if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2765: goto done;
2766: oldoffset = offset + (oldloc - base);
2767: newoffset = offset + (newloc - base);
2768:
2769: LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2770: if (dap->da_offset != oldoffset)
2771: continue;
2772: dap->da_offset = newoffset;
2773: if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2774: break;
2775: LIST_REMOVE(dap, da_pdlist);
2776: LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2777: dap, da_pdlist);
2778: break;
2779: }
2780: if (dap == NULL) {
2781:
2782: LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2783: if (dap->da_offset == oldoffset) {
2784: dap->da_offset = newoffset;
2785: break;
2786: }
2787: }
2788: }
2789: done:
2790: bcopy(oldloc, newloc, entrysize);
2791: FREE_LOCK(&lk);
2792: }
2793:
2794: /*
2795: * Free a diradd dependency structure. This routine must be called
2796: * with splbio interrupts blocked.
2797: */
2798: STATIC void
2799: free_diradd(dap)
2800: struct diradd *dap;
2801: {
2802: struct dirrem *dirrem;
2803: struct pagedep *pagedep;
2804: struct inodedep *inodedep;
2805: struct mkdir *mkdir, *nextmd;
2806:
2807: splassert(IPL_BIO);
2808:
2809: #ifdef DEBUG
2810: if (lk.lkt_held == -1)
2811: panic("free_diradd: lock not held");
2812: #endif
2813: WORKLIST_REMOVE(&dap->da_list);
2814: LIST_REMOVE(dap, da_pdlist);
2815: if ((dap->da_state & DIRCHG) == 0) {
2816: pagedep = dap->da_pagedep;
2817: } else {
2818: dirrem = dap->da_previous;
2819: pagedep = dirrem->dm_pagedep;
2820: dirrem->dm_dirinum = pagedep->pd_ino;
2821: add_to_worklist(&dirrem->dm_list);
2822: }
2823: if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2824: 0, &inodedep) != 0)
2825: (void) free_inodedep(inodedep);
2826: if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2827: for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2828: nextmd = LIST_NEXT(mkdir, md_mkdirs);
2829: if (mkdir->md_diradd != dap)
2830: continue;
2831: dap->da_state &= ~mkdir->md_state;
2832: WORKLIST_REMOVE(&mkdir->md_list);
2833: LIST_REMOVE(mkdir, md_mkdirs);
2834: WORKITEM_FREE(mkdir, D_MKDIR);
2835: }
2836: if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2837: FREE_LOCK(&lk);
2838: panic("free_diradd: unfound ref");
2839: }
2840: }
2841: WORKITEM_FREE(dap, D_DIRADD);
2842: }
2843:
2844: /*
2845: * Directory entry removal dependencies.
2846: *
2847: * When removing a directory entry, the entry's inode pointer must be
2848: * zero'ed on disk before the corresponding inode's link count is decremented
2849: * (possibly freeing the inode for re-use). This dependency is handled by
2850: * updating the directory entry but delaying the inode count reduction until
2851: * after the directory block has been written to disk. After this point, the
2852: * inode count can be decremented whenever it is convenient.
2853: */
2854:
2855: /*
2856: * This routine should be called immediately after removing
2857: * a directory entry. The inode's link count should not be
2858: * decremented by the calling procedure -- the soft updates
2859: * code will do this task when it is safe.
2860: */
2861: void
2862: softdep_setup_remove(bp, dp, ip, isrmdir)
2863: struct buf *bp; /* buffer containing directory block */
2864: struct inode *dp; /* inode for the directory being modified */
2865: struct inode *ip; /* inode for directory entry being removed */
2866: int isrmdir; /* indicates if doing RMDIR */
2867: {
2868: struct dirrem *dirrem, *prevdirrem;
2869:
2870: /*
2871: * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2872: */
2873: dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2874:
2875: /*
2876: * If the COMPLETE flag is clear, then there were no active
2877: * entries and we want to roll back to a zeroed entry until
2878: * the new inode is committed to disk. If the COMPLETE flag is
2879: * set then we have deleted an entry that never made it to
2880: * disk. If the entry we deleted resulted from a name change,
2881: * then the old name still resides on disk. We cannot delete
2882: * its inode (returned to us in prevdirrem) until the zeroed
2883: * directory entry gets to disk. The new inode has never been
2884: * referenced on the disk, so can be deleted immediately.
2885: */
2886: if ((dirrem->dm_state & COMPLETE) == 0) {
2887: LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2888: dm_next);
2889: FREE_LOCK(&lk);
2890: } else {
2891: if (prevdirrem != NULL)
2892: LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2893: prevdirrem, dm_next);
2894: dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2895: FREE_LOCK(&lk);
2896: handle_workitem_remove(dirrem);
2897: }
2898: }
2899:
2900: /*
2901: * Allocate a new dirrem if appropriate and return it along with
2902: * its associated pagedep. Called without a lock, returns with lock.
2903: */
2904: STATIC long num_dirrem; /* number of dirrem allocated */
2905: STATIC struct dirrem *
2906: newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2907: struct buf *bp; /* buffer containing directory block */
2908: struct inode *dp; /* inode for the directory being modified */
2909: struct inode *ip; /* inode for directory entry being removed */
2910: int isrmdir; /* indicates if doing RMDIR */
2911: struct dirrem **prevdirremp; /* previously referenced inode, if any */
2912: {
2913: int offset;
2914: daddr64_t lbn;
2915: struct diradd *dap;
2916: struct dirrem *dirrem;
2917: struct pagedep *pagedep;
2918:
2919: /*
2920: * Whiteouts have no deletion dependencies.
2921: */
2922: if (ip == NULL)
2923: panic("newdirrem: whiteout");
2924: /*
2925: * If we are over our limit, try to improve the situation.
2926: * Limiting the number of dirrem structures will also limit
2927: * the number of freefile and freeblks structures.
2928: */
2929: if (num_dirrem > max_softdeps / 2)
2930: (void) request_cleanup(FLUSH_REMOVE, 0);
2931: num_dirrem += 1;
2932: dirrem = pool_get(&dirrem_pool, PR_WAITOK);
2933: bzero(dirrem,sizeof(struct dirrem));
2934: dirrem->dm_list.wk_type = D_DIRREM;
2935: dirrem->dm_state = isrmdir ? RMDIR : 0;
2936: dirrem->dm_mnt = ITOV(ip)->v_mount;
2937: dirrem->dm_oldinum = ip->i_number;
2938: *prevdirremp = NULL;
2939:
2940: ACQUIRE_LOCK(&lk);
2941: lbn = lblkno(dp->i_fs, dp->i_offset);
2942: offset = blkoff(dp->i_fs, dp->i_offset);
2943: if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2944: WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2945: dirrem->dm_pagedep = pagedep;
2946: /*
2947: * Check for a diradd dependency for the same directory entry.
2948: * If present, then both dependencies become obsolete and can
2949: * be de-allocated. Check for an entry on both the pd_dirraddhd
2950: * list and the pd_pendinghd list.
2951: */
2952:
2953: LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2954: if (dap->da_offset == offset)
2955: break;
2956: if (dap == NULL) {
2957:
2958: LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2959: if (dap->da_offset == offset)
2960: break;
2961: if (dap == NULL)
2962: return (dirrem);
2963: }
2964: /*
2965: * Must be ATTACHED at this point.
2966: */
2967: if ((dap->da_state & ATTACHED) == 0) {
2968: FREE_LOCK(&lk);
2969: panic("newdirrem: not ATTACHED");
2970: }
2971: if (dap->da_newinum != ip->i_number) {
2972: FREE_LOCK(&lk);
2973: panic("newdirrem: inum %d should be %d",
2974: ip->i_number, dap->da_newinum);
2975: }
2976: /*
2977: * If we are deleting a changed name that never made it to disk,
2978: * then return the dirrem describing the previous inode (which
2979: * represents the inode currently referenced from this entry on disk).
2980: */
2981: if ((dap->da_state & DIRCHG) != 0) {
2982: *prevdirremp = dap->da_previous;
2983: dap->da_state &= ~DIRCHG;
2984: dap->da_pagedep = pagedep;
2985: }
2986: /*
2987: * We are deleting an entry that never made it to disk.
2988: * Mark it COMPLETE so we can delete its inode immediately.
2989: */
2990: dirrem->dm_state |= COMPLETE;
2991: free_diradd(dap);
2992: return (dirrem);
2993: }
2994:
2995: /*
2996: * Directory entry change dependencies.
2997: *
2998: * Changing an existing directory entry requires that an add operation
2999: * be completed first followed by a deletion. The semantics for the addition
3000: * are identical to the description of adding a new entry above except
3001: * that the rollback is to the old inode number rather than zero. Once
3002: * the addition dependency is completed, the removal is done as described
3003: * in the removal routine above.
3004: */
3005:
3006: /*
3007: * This routine should be called immediately after changing
3008: * a directory entry. The inode's link count should not be
3009: * decremented by the calling procedure -- the soft updates
3010: * code will perform this task when it is safe.
3011: */
3012: void
3013: softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3014: struct buf *bp; /* buffer containing directory block */
3015: struct inode *dp; /* inode for the directory being modified */
3016: struct inode *ip; /* inode for directory entry being removed */
3017: long newinum; /* new inode number for changed entry */
3018: int isrmdir; /* indicates if doing RMDIR */
3019: {
3020: int offset;
3021: struct diradd *dap = NULL;
3022: struct dirrem *dirrem, *prevdirrem;
3023: struct pagedep *pagedep;
3024: struct inodedep *inodedep;
3025:
3026: offset = blkoff(dp->i_fs, dp->i_offset);
3027: dap = pool_get(&diradd_pool, PR_WAITOK);
3028: bzero(dap,sizeof(struct diradd));
3029: dap->da_list.wk_type = D_DIRADD;
3030: dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3031: dap->da_offset = offset;
3032: dap->da_newinum = newinum;
3033:
3034: /*
3035: * Allocate a new dirrem and ACQUIRE_LOCK.
3036: */
3037: dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3038: pagedep = dirrem->dm_pagedep;
3039: /*
3040: * The possible values for isrmdir:
3041: * 0 - non-directory file rename
3042: * 1 - directory rename within same directory
3043: * inum - directory rename to new directory of given inode number
3044: * When renaming to a new directory, we are both deleting and
3045: * creating a new directory entry, so the link count on the new
3046: * directory should not change. Thus we do not need the followup
3047: * dirrem which is usually done in handle_workitem_remove. We set
3048: * the DIRCHG flag to tell handle_workitem_remove to skip the
3049: * followup dirrem.
3050: */
3051: if (isrmdir > 1)
3052: dirrem->dm_state |= DIRCHG;
3053:
3054: /*
3055: * If the COMPLETE flag is clear, then there were no active
3056: * entries and we want to roll back to the previous inode until
3057: * the new inode is committed to disk. If the COMPLETE flag is
3058: * set, then we have deleted an entry that never made it to disk.
3059: * If the entry we deleted resulted from a name change, then the old
3060: * inode reference still resides on disk. Any rollback that we do
3061: * needs to be to that old inode (returned to us in prevdirrem). If
3062: * the entry we deleted resulted from a create, then there is
3063: * no entry on the disk, so we want to roll back to zero rather
3064: * than the uncommitted inode. In either of the COMPLETE cases we
3065: * want to immediately free the unwritten and unreferenced inode.
3066: */
3067: if ((dirrem->dm_state & COMPLETE) == 0) {
3068: dap->da_previous = dirrem;
3069: } else {
3070: if (prevdirrem != NULL) {
3071: dap->da_previous = prevdirrem;
3072: } else {
3073: dap->da_state &= ~DIRCHG;
3074: dap->da_pagedep = pagedep;
3075: }
3076: dirrem->dm_dirinum = pagedep->pd_ino;
3077: add_to_worklist(&dirrem->dm_list);
3078: }
3079: /*
3080: * Link into its inodedep. Put it on the id_bufwait list if the inode
3081: * is not yet written. If it is written, do the post-inode write
3082: * processing to put it on the id_pendinghd list.
3083: */
3084: if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3085: (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3086: dap->da_state |= COMPLETE;
3087: LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3088: WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3089: } else {
3090: LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3091: dap, da_pdlist);
3092: WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3093: }
3094: FREE_LOCK(&lk);
3095: }
3096:
3097: /*
3098: * Called whenever the link count on an inode is changed.
3099: * It creates an inode dependency so that the new reference(s)
3100: * to the inode cannot be committed to disk until the updated
3101: * inode has been written.
3102: */
3103: void
3104: softdep_change_linkcnt(ip, nodelay)
3105: struct inode *ip; /* the inode with the increased link count */
3106: int nodelay; /* do background work or not */
3107: {
3108: struct inodedep *inodedep;
3109: int flags;
3110:
3111: /*
3112: * If requested, do not allow background work to happen.
3113: */
3114: flags = DEPALLOC;
3115: if (nodelay)
3116: flags |= NODELAY;
3117:
3118: ACQUIRE_LOCK(&lk);
3119:
3120: (void) inodedep_lookup(ip->i_fs, ip->i_number, flags, &inodedep);
3121: if (DIP(ip, nlink) < ip->i_effnlink) {
3122: FREE_LOCK(&lk);
3123: panic("softdep_change_linkcnt: bad delta");
3124: }
3125:
3126: inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3127:
3128: FREE_LOCK(&lk);
3129: }
3130:
3131: /*
3132: * This workitem decrements the inode's link count.
3133: * If the link count reaches zero, the file is removed.
3134: */
3135: STATIC void
3136: handle_workitem_remove(dirrem)
3137: struct dirrem *dirrem;
3138: {
3139: struct proc *p = CURPROC; /* XXX */
3140: struct inodedep *inodedep;
3141: struct vnode *vp;
3142: struct inode *ip;
3143: ino_t oldinum;
3144: int error;
3145:
3146: if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
3147: softdep_error("handle_workitem_remove: vget", error);
3148: return;
3149: }
3150: ip = VTOI(vp);
3151: ACQUIRE_LOCK(&lk);
3152: if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep))
3153: == 0) {
3154: FREE_LOCK(&lk);
3155: panic("handle_workitem_remove: lost inodedep");
3156: }
3157: /*
3158: * Normal file deletion.
3159: */
3160: if ((dirrem->dm_state & RMDIR) == 0) {
3161: DIP_ADD(ip, nlink, -1);
3162: ip->i_flag |= IN_CHANGE;
3163: if (DIP(ip, nlink) < ip->i_effnlink) {
3164: FREE_LOCK(&lk);
3165: panic("handle_workitem_remove: bad file delta");
3166: }
3167: inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3168: FREE_LOCK(&lk);
3169: vput(vp);
3170: num_dirrem -= 1;
3171: WORKITEM_FREE(dirrem, D_DIRREM);
3172: return;
3173: }
3174: /*
3175: * Directory deletion. Decrement reference count for both the
3176: * just deleted parent directory entry and the reference for ".".
3177: * Next truncate the directory to length zero. When the
3178: * truncation completes, arrange to have the reference count on
3179: * the parent decremented to account for the loss of "..".
3180: */
3181: DIP_ADD(ip, nlink, -2);
3182: ip->i_flag |= IN_CHANGE;
3183: if (DIP(ip, nlink) < ip->i_effnlink)
3184: panic("handle_workitem_remove: bad dir delta");
3185: inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3186: FREE_LOCK(&lk);
3187: if ((error = UFS_TRUNCATE(ip, (off_t)0, 0, p->p_ucred)) != 0)
3188: softdep_error("handle_workitem_remove: truncate", error);
3189: /*
3190: * Rename a directory to a new parent. Since, we are both deleting
3191: * and creating a new directory entry, the link count on the new
3192: * directory should not change. Thus we skip the followup dirrem.
3193: */
3194: if (dirrem->dm_state & DIRCHG) {
3195: vput(vp);
3196: num_dirrem -= 1;
3197: WORKITEM_FREE(dirrem, D_DIRREM);
3198: return;
3199: }
3200: /*
3201: * If the inodedep does not exist, then the zero'ed inode has
3202: * been written to disk. If the allocated inode has never been
3203: * written to disk, then the on-disk inode is zero'ed. In either
3204: * case we can remove the file immediately.
3205: */
3206: ACQUIRE_LOCK(&lk);
3207: dirrem->dm_state = 0;
3208: oldinum = dirrem->dm_oldinum;
3209: dirrem->dm_oldinum = dirrem->dm_dirinum;
3210: if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3211: check_inode_unwritten(inodedep)) {
3212: FREE_LOCK(&lk);
3213: vput(vp);
3214: handle_workitem_remove(dirrem);
3215: return;
3216: }
3217: WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3218: FREE_LOCK(&lk);
3219: ip->i_flag |= IN_CHANGE;
3220: UFS_UPDATE(VTOI(vp), 0);
3221: vput(vp);
3222: }
3223:
3224: /*
3225: * Inode de-allocation dependencies.
3226: *
3227: * When an inode's link count is reduced to zero, it can be de-allocated. We
3228: * found it convenient to postpone de-allocation until after the inode is
3229: * written to disk with its new link count (zero). At this point, all of the
3230: * on-disk inode's block pointers are nullified and, with careful dependency
3231: * list ordering, all dependencies related to the inode will be satisfied and
3232: * the corresponding dependency structures de-allocated. So, if/when the
3233: * inode is reused, there will be no mixing of old dependencies with new
3234: * ones. This artificial dependency is set up by the block de-allocation
3235: * procedure above (softdep_setup_freeblocks) and completed by the
3236: * following procedure.
3237: */
3238: STATIC void
3239: handle_workitem_freefile(freefile)
3240: struct freefile *freefile;
3241: {
3242: struct fs *fs;
3243: struct vnode vp;
3244: struct inode tip;
3245: #ifdef DEBUG
3246: struct inodedep *idp;
3247: #endif
3248: int error;
3249:
3250: fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3251: #ifdef DEBUG
3252: ACQUIRE_LOCK(&lk);
3253: error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3254: FREE_LOCK(&lk);
3255: if (error)
3256: panic("handle_workitem_freefile: inodedep survived");
3257: #endif
3258: tip.i_ump = VFSTOUFS(freefile->fx_mnt);
3259: tip.i_dev = freefile->fx_devvp->v_rdev;
3260: tip.i_fs = fs;
3261: tip.i_vnode = &vp;
3262: vp.v_data = &tip;
3263:
3264: if ((error = ffs_freefile(&tip, freefile->fx_oldinum,
3265: freefile->fx_mode)) != 0) {
3266: softdep_error("handle_workitem_freefile", error);
3267: }
3268: WORKITEM_FREE(freefile, D_FREEFILE);
3269: }
3270:
3271: /*
3272: * Disk writes.
3273: *
3274: * The dependency structures constructed above are most actively used when file
3275: * system blocks are written to disk. No constraints are placed on when a
3276: * block can be written, but unsatisfied update dependencies are made safe by
3277: * modifying (or replacing) the source memory for the duration of the disk
3278: * write. When the disk write completes, the memory block is again brought
3279: * up-to-date.
3280: *
3281: * In-core inode structure reclamation.
3282: *
3283: * Because there are a finite number of "in-core" inode structures, they are
3284: * reused regularly. By transferring all inode-related dependencies to the
3285: * in-memory inode block and indexing them separately (via "inodedep"s), we
3286: * can allow "in-core" inode structures to be reused at any time and avoid
3287: * any increase in contention.
3288: *
3289: * Called just before entering the device driver to initiate a new disk I/O.
3290: * The buffer must be locked, thus, no I/O completion operations can occur
3291: * while we are manipulating its associated dependencies.
3292: */
3293: void
3294: softdep_disk_io_initiation(bp)
3295: struct buf *bp; /* structure describing disk write to occur */
3296: {
3297: struct worklist *wk, *nextwk;
3298: struct indirdep *indirdep;
3299: struct inodedep *inodedep;
3300: struct buf *sbp;
3301:
3302: /*
3303: * We only care about write operations. There should never
3304: * be dependencies for reads.
3305: */
3306: if (bp->b_flags & B_READ)
3307: panic("softdep_disk_io_initiation: read");
3308:
3309: ACQUIRE_LOCK(&lk);
3310:
3311: /*
3312: * Do any necessary pre-I/O processing.
3313: */
3314: for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3315: nextwk = LIST_NEXT(wk, wk_list);
3316: switch (wk->wk_type) {
3317:
3318: case D_PAGEDEP:
3319: initiate_write_filepage(WK_PAGEDEP(wk), bp);
3320: continue;
3321:
3322: case D_INODEDEP:
3323: inodedep = WK_INODEDEP(wk);
3324: if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3325: initiate_write_inodeblock_ufs1(inodedep, bp);
3326: #ifdef FFS2
3327: else
3328: initiate_write_inodeblock_ufs2(inodedep, bp);
3329: #endif
3330: continue;
3331:
3332: case D_INDIRDEP:
3333: indirdep = WK_INDIRDEP(wk);
3334: if (indirdep->ir_state & GOINGAWAY)
3335: panic("disk_io_initiation: indirdep gone");
3336: /*
3337: * If there are no remaining dependencies, this
3338: * will be writing the real pointers, so the
3339: * dependency can be freed.
3340: */
3341: if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3342: sbp = indirdep->ir_savebp;
3343: sbp->b_flags |= B_INVAL | B_NOCACHE;
3344: /* inline expand WORKLIST_REMOVE(wk); */
3345: wk->wk_state &= ~ONWORKLIST;
3346: LIST_REMOVE(wk, wk_list);
3347: WORKITEM_FREE(indirdep, D_INDIRDEP);
3348: FREE_LOCK(&lk);
3349: brelse(sbp);
3350: ACQUIRE_LOCK(&lk);
3351: continue;
3352: }
3353: /*
3354: * Replace up-to-date version with safe version.
3355: */
3356: FREE_LOCK(&lk);
3357: indirdep->ir_saveddata = malloc(bp->b_bcount,
3358: M_INDIRDEP, M_WAITOK);
3359: ACQUIRE_LOCK(&lk);
3360: indirdep->ir_state &= ~ATTACHED;
3361: indirdep->ir_state |= UNDONE;
3362: bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3363: bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3364: bp->b_bcount);
3365: continue;
3366:
3367: case D_MKDIR:
3368: case D_BMSAFEMAP:
3369: case D_ALLOCDIRECT:
3370: case D_ALLOCINDIR:
3371: continue;
3372:
3373: default:
3374: FREE_LOCK(&lk);
3375: panic("handle_disk_io_initiation: Unexpected type %s",
3376: TYPENAME(wk->wk_type));
3377: /* NOTREACHED */
3378: }
3379: }
3380:
3381: FREE_LOCK(&lk);
3382: }
3383:
3384: /*
3385: * Called from within the procedure above to deal with unsatisfied
3386: * allocation dependencies in a directory. The buffer must be locked,
3387: * thus, no I/O completion operations can occur while we are
3388: * manipulating its associated dependencies.
3389: */
3390: STATIC void
3391: initiate_write_filepage(pagedep, bp)
3392: struct pagedep *pagedep;
3393: struct buf *bp;
3394: {
3395: struct diradd *dap;
3396: struct direct *ep;
3397: int i;
3398:
3399: if (pagedep->pd_state & IOSTARTED) {
3400: /*
3401: * This can only happen if there is a driver that does not
3402: * understand chaining. Here biodone will reissue the call
3403: * to strategy for the incomplete buffers.
3404: */
3405: printf("initiate_write_filepage: already started\n");
3406: return;
3407: }
3408: pagedep->pd_state |= IOSTARTED;
3409: for (i = 0; i < DAHASHSZ; i++) {
3410: LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3411: ep = (struct direct *)
3412: ((char *)bp->b_data + dap->da_offset);
3413: if (ep->d_ino != dap->da_newinum) {
3414: FREE_LOCK(&lk);
3415: panic("%s: dir inum %d != new %d",
3416: "initiate_write_filepage",
3417: ep->d_ino, dap->da_newinum);
3418: }
3419: if (dap->da_state & DIRCHG)
3420: ep->d_ino = dap->da_previous->dm_oldinum;
3421: else
3422: ep->d_ino = 0;
3423: dap->da_state &= ~ATTACHED;
3424: dap->da_state |= UNDONE;
3425: }
3426: }
3427: }
3428:
3429: /*
3430: * Called from within the procedure above to deal with unsatisfied
3431: * allocation dependencies in an inodeblock. The buffer must be
3432: * locked, thus, no I/O completion operations can occur while we
3433: * are manipulating its associated dependencies.
3434: */
3435: STATIC void
3436: initiate_write_inodeblock_ufs1(inodedep, bp)
3437: struct inodedep *inodedep;
3438: struct buf *bp; /* The inode block */
3439: {
3440: struct allocdirect *adp, *lastadp;
3441: struct ufs1_dinode *dp;
3442: struct fs *fs;
3443: #ifdef DIAGNOSTIC
3444: daddr64_t prevlbn = 0;
3445: int32_t d1, d2;
3446: #endif
3447: int i, deplist;
3448:
3449: if (inodedep->id_state & IOSTARTED) {
3450: FREE_LOCK(&lk);
3451: panic("initiate_write_inodeblock: already started");
3452: }
3453: inodedep->id_state |= IOSTARTED;
3454: fs = inodedep->id_fs;
3455: dp = (struct ufs1_dinode *)bp->b_data +
3456: ino_to_fsbo(fs, inodedep->id_ino);
3457: /*
3458: * If the bitmap is not yet written, then the allocated
3459: * inode cannot be written to disk.
3460: */
3461: if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3462: if (inodedep->id_savedino1 != NULL) {
3463: FREE_LOCK(&lk);
3464: panic("initiate_write_inodeblock: already doing I/O");
3465: }
3466: FREE_LOCK(&lk);
3467: MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3468: sizeof(struct ufs1_dinode), M_INODEDEP, M_WAITOK);
3469: ACQUIRE_LOCK(&lk);
3470: *inodedep->id_savedino1 = *dp;
3471: bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3472: return;
3473: }
3474: /*
3475: * If no dependencies, then there is nothing to roll back.
3476: */
3477: inodedep->id_savedsize = dp->di_size;
3478: if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3479: return;
3480: /*
3481: * Set the dependencies to busy.
3482: */
3483: for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3484: adp = TAILQ_NEXT(adp, ad_next)) {
3485: #ifdef DIAGNOSTIC
3486: if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3487: FREE_LOCK(&lk);
3488: panic("softdep_write_inodeblock: lbn order");
3489: }
3490: prevlbn = adp->ad_lbn;
3491: if (adp->ad_lbn < NDADDR &&
3492: (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3493: FREE_LOCK(&lk);
3494: panic("%s: direct pointer #%ld mismatch %d != %d",
3495: "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3496: }
3497: if (adp->ad_lbn >= NDADDR &&
3498: (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3499: (d2 = adp->ad_newblkno)) {
3500: FREE_LOCK(&lk);
3501: panic("%s: indirect pointer #%ld mismatch %d != %d",
3502: "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3503: d1, d2);
3504: }
3505: deplist |= 1 << adp->ad_lbn;
3506: if ((adp->ad_state & ATTACHED) == 0) {
3507: FREE_LOCK(&lk);
3508: panic("softdep_write_inodeblock: Unknown state 0x%x",
3509: adp->ad_state);
3510: }
3511: #endif /* DIAGNOSTIC */
3512: adp->ad_state &= ~ATTACHED;
3513: adp->ad_state |= UNDONE;
3514: }
3515: /*
3516: * The on-disk inode cannot claim to be any larger than the last
3517: * fragment that has been written. Otherwise, the on-disk inode
3518: * might have fragments that were not the last block in the file
3519: * which would corrupt the filesystem.
3520: */
3521: for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3522: lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3523: if (adp->ad_lbn >= NDADDR)
3524: break;
3525: dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3526: /* keep going until hitting a rollback to a frag */
3527: if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3528: continue;
3529: dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3530: for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3531: #ifdef DIAGNOSTIC
3532: if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3533: FREE_LOCK(&lk);
3534: panic("softdep_write_inodeblock: lost dep1");
3535: }
3536: #endif /* DIAGNOSTIC */
3537: dp->di_db[i] = 0;
3538: }
3539: for (i = 0; i < NIADDR; i++) {
3540: #ifdef DIAGNOSTIC
3541: if (dp->di_ib[i] != 0 &&
3542: (deplist & ((1 << NDADDR) << i)) == 0) {
3543: FREE_LOCK(&lk);
3544: panic("softdep_write_inodeblock: lost dep2");
3545: }
3546: #endif /* DIAGNOSTIC */
3547: dp->di_ib[i] = 0;
3548: }
3549: return;
3550: }
3551: /*
3552: * If we have zero'ed out the last allocated block of the file,
3553: * roll back the size to the last currently allocated block.
3554: * We know that this last allocated block is a full-sized as
3555: * we already checked for fragments in the loop above.
3556: */
3557: if (lastadp != NULL &&
3558: dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3559: for (i = lastadp->ad_lbn; i >= 0; i--)
3560: if (dp->di_db[i] != 0)
3561: break;
3562: dp->di_size = (i + 1) * fs->fs_bsize;
3563: }
3564: /*
3565: * The only dependencies are for indirect blocks.
3566: *
3567: * The file size for indirect block additions is not guaranteed.
3568: * Such a guarantee would be non-trivial to achieve. The conventional
3569: * synchronous write implementation also does not make this guarantee.
3570: * Fsck should catch and fix discrepancies. Arguably, the file size
3571: * can be over-estimated without destroying integrity when the file
3572: * moves into the indirect blocks (i.e., is large). If we want to
3573: * postpone fsck, we are stuck with this argument.
3574: */
3575: for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3576: dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3577: }
3578:
3579: #ifdef FFS2
3580: /*
3581: * Version of initiate_write_inodeblock that handles FFS2 dinodes.
3582: */
3583: STATIC void
3584: initiate_write_inodeblock_ufs2(inodedep, bp)
3585: struct inodedep *inodedep;
3586: struct buf *bp; /* The inode block */
3587: {
3588: struct allocdirect *adp, *lastadp;
3589: struct ufs2_dinode *dp;
3590: struct fs *fs = inodedep->id_fs;
3591: #ifdef DIAGNOSTIC
3592: daddr64_t prevlbn = -1, d1, d2;
3593: #endif
3594: int deplist, i;
3595:
3596: if (inodedep->id_state & IOSTARTED)
3597: panic("initiate_write_inodeblock_ufs2: already started");
3598: inodedep->id_state |= IOSTARTED;
3599: fs = inodedep->id_fs;
3600: dp = (struct ufs2_dinode *)bp->b_data +
3601: ino_to_fsbo(fs, inodedep->id_ino);
3602: /*
3603: * If the bitmap is not yet written, then the allocated
3604: * inode cannot be written to disk.
3605: */
3606: if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3607: if (inodedep->id_savedino2 != NULL)
3608: panic("initiate_write_inodeblock_ufs2: I/O underway");
3609: MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3610: sizeof(struct ufs2_dinode), M_INODEDEP, M_WAITOK);
3611: *inodedep->id_savedino2 = *dp;
3612: bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3613: return;
3614: }
3615: /*
3616: * If no dependencies, then there is nothing to roll back.
3617: */
3618: inodedep->id_savedsize = dp->di_size;
3619: if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3620: return;
3621:
3622: #ifdef notyet
3623: inodedep->id_savedextsize = dp->di_extsize;
3624: if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3625: TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3626: return;
3627: /*
3628: * Set the ext data dependencies to busy.
3629: */
3630: for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3631: adp = TAILQ_NEXT(adp, ad_next)) {
3632: #ifdef DIAGNOSTIC
3633: if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3634: FREE_LOCK(&lk);
3635: panic("softdep_write_inodeblock: lbn order");
3636: }
3637: prevlbn = adp->ad_lbn;
3638: if ((d1 = dp->di_extb[adp->ad_lbn]) !=
3639: (d2 = adp->ad_newblkno)) {
3640: FREE_LOCK(&lk);
3641: panic("%s: direct pointer #%ld mismatch %ld != %ld",
3642: "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3643: }
3644: deplist |= 1 << adp->ad_lbn;
3645: if ((adp->ad_state & ATTACHED) == 0) {
3646: FREE_LOCK(&lk);
3647: panic("softdep_write_inodeblock: Unknown state 0x%x",
3648: adp->ad_state);
3649: }
3650: #endif /* DIAGNOSTIC */
3651: adp->ad_state &= ~ATTACHED;
3652: adp->ad_state |= UNDONE;
3653: }
3654: /*
3655: * The on-disk inode cannot claim to be any larger than the last
3656: * fragment that has been written. Otherwise, the on-disk inode
3657: * might have fragments that were not the last block in the ext
3658: * data which would corrupt the filesystem.
3659: */
3660: for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3661: lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3662: dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3663: /* keep going until hitting a rollback to a frag */
3664: if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3665: continue;
3666: dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3667: for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3668: #ifdef DIAGNOSTIC
3669: if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
3670: FREE_LOCK(&lk);
3671: panic("softdep_write_inodeblock: lost dep1");
3672: }
3673: #endif /* DIAGNOSTIC */
3674: dp->di_extb[i] = 0;
3675: }
3676: lastadp = NULL;
3677: break;
3678: }
3679: /*
3680: * If we have zero'ed out the last allocated block of the ext
3681: * data, roll back the size to the last currently allocated block.
3682: * We know that this last allocated block is a full-sized as
3683: * we already checked for fragments in the loop above.
3684: */
3685: if (lastadp != NULL &&
3686: dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3687: for (i = lastadp->ad_lbn; i >= 0; i--)
3688: if (dp->di_extb[i] != 0)
3689: break;
3690: dp->di_extsize = (i + 1) * fs->fs_bsize;
3691: }
3692: #endif /* notyet */
3693:
3694: /*
3695: * Set the file data dependencies to busy.
3696: */
3697: for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3698: adp = TAILQ_NEXT(adp, ad_next)) {
3699: #ifdef DIAGNOSTIC
3700: if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3701: FREE_LOCK(&lk);
3702: panic("softdep_write_inodeblock: lbn order");
3703: }
3704: prevlbn = adp->ad_lbn;
3705: if (adp->ad_lbn < NDADDR &&
3706: (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3707: FREE_LOCK(&lk);
3708: panic("%s: direct pointer #%ld mismatch %ld != %ld",
3709: "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3710: }
3711: if (adp->ad_lbn >= NDADDR &&
3712: (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3713: (d2 = adp->ad_newblkno)) {
3714: FREE_LOCK(&lk);
3715: panic("%s: indirect pointer #%ld mismatch %ld != %ld",
3716: "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3717: d1, d2);
3718: }
3719: deplist |= 1 << adp->ad_lbn;
3720: if ((adp->ad_state & ATTACHED) == 0) {
3721: FREE_LOCK(&lk);
3722: panic("softdep_write_inodeblock: Unknown state 0x%x",
3723: adp->ad_state);
3724: }
3725: #endif /* DIAGNOSTIC */
3726: adp->ad_state &= ~ATTACHED;
3727: adp->ad_state |= UNDONE;
3728: }
3729: /*
3730: * The on-disk inode cannot claim to be any larger than the last
3731: * fragment that has been written. Otherwise, the on-disk inode
3732: * might have fragments that were not the last block in the file
3733: * which would corrupt the filesystem.
3734: */
3735: for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3736: lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3737: if (adp->ad_lbn >= NDADDR)
3738: break;
3739: dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3740: /* keep going until hitting a rollback to a frag */
3741: if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3742: continue;
3743: dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3744: for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3745: #ifdef DIAGNOSTIC
3746: if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3747: FREE_LOCK(&lk);
3748: panic("softdep_write_inodeblock: lost dep2");
3749: }
3750: #endif /* DIAGNOSTIC */
3751: dp->di_db[i] = 0;
3752: }
3753: for (i = 0; i < NIADDR; i++) {
3754: #ifdef DIAGNOSTIC
3755: if (dp->di_ib[i] != 0 &&
3756: (deplist & ((1 << NDADDR) << i)) == 0) {
3757: FREE_LOCK(&lk);
3758: panic("softdep_write_inodeblock: lost dep3");
3759: }
3760: #endif /* DIAGNOSTIC */
3761: dp->di_ib[i] = 0;
3762: }
3763: return;
3764: }
3765: /*
3766: * If we have zero'ed out the last allocated block of the file,
3767: * roll back the size to the last currently allocated block.
3768: * We know that this last allocated block is a full-sized as
3769: * we already checked for fragments in the loop above.
3770: */
3771: if (lastadp != NULL &&
3772: dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3773: for (i = lastadp->ad_lbn; i >= 0; i--)
3774: if (dp->di_db[i] != 0)
3775: break;
3776: dp->di_size = (i + 1) * fs->fs_bsize;
3777: }
3778: /*
3779: * The only dependencies are for indirect blocks.
3780: *
3781: * The file size for indirect block additions is not guaranteed.
3782: * Such a guarantee would be non-trivial to achieve. The conventional
3783: * synchronous write implementation also does not make this guarantee.
3784: * Fsck should catch and fix discrepancies. Arguably, the file size
3785: * can be over-estimated without destroying integrity when the file
3786: * moves into the indirect blocks (i.e., is large). If we want to
3787: * postpone fsck, we are stuck with this argument.
3788: */
3789: for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3790: dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3791: }
3792: #endif /* FFS2 */
3793:
3794: /*
3795: * This routine is called during the completion interrupt
3796: * service routine for a disk write (from the procedure called
3797: * by the device driver to inform the file system caches of
3798: * a request completion). It should be called early in this
3799: * procedure, before the block is made available to other
3800: * processes or other routines are called.
3801: */
3802: void
3803: softdep_disk_write_complete(bp)
3804: struct buf *bp; /* describes the completed disk write */
3805: {
3806: struct worklist *wk;
3807: struct workhead reattach;
3808: struct newblk *newblk;
3809: struct allocindir *aip;
3810: struct allocdirect *adp;
3811: struct indirdep *indirdep;
3812: struct inodedep *inodedep;
3813: struct bmsafemap *bmsafemap;
3814:
3815: /*
3816: * If an error occurred while doing the write, then the data
3817: * has not hit the disk and the dependencies cannot be unrolled.
3818: */
3819: if ((bp->b_flags & B_ERROR) && !(bp->b_flags & B_INVAL))
3820: return;
3821:
3822: #ifdef DEBUG
3823: if (lk.lkt_held != -1)
3824: panic("softdep_disk_write_complete: lock is held");
3825: lk.lkt_held = -2;
3826: #endif
3827: LIST_INIT(&reattach);
3828: while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3829: WORKLIST_REMOVE(wk);
3830: switch (wk->wk_type) {
3831:
3832: case D_PAGEDEP:
3833: if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3834: WORKLIST_INSERT(&reattach, wk);
3835: continue;
3836:
3837: case D_INODEDEP:
3838: if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3839: WORKLIST_INSERT(&reattach, wk);
3840: continue;
3841:
3842: case D_BMSAFEMAP:
3843: bmsafemap = WK_BMSAFEMAP(wk);
3844: while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3845: newblk->nb_state |= DEPCOMPLETE;
3846: newblk->nb_bmsafemap = NULL;
3847: LIST_REMOVE(newblk, nb_deps);
3848: }
3849: while ((adp =
3850: LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3851: adp->ad_state |= DEPCOMPLETE;
3852: adp->ad_buf = NULL;
3853: LIST_REMOVE(adp, ad_deps);
3854: handle_allocdirect_partdone(adp);
3855: }
3856: while ((aip =
3857: LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3858: aip->ai_state |= DEPCOMPLETE;
3859: aip->ai_buf = NULL;
3860: LIST_REMOVE(aip, ai_deps);
3861: handle_allocindir_partdone(aip);
3862: }
3863: while ((inodedep =
3864: LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3865: inodedep->id_state |= DEPCOMPLETE;
3866: LIST_REMOVE(inodedep, id_deps);
3867: inodedep->id_buf = NULL;
3868: }
3869: WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3870: continue;
3871:
3872: case D_MKDIR:
3873: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3874: continue;
3875:
3876: case D_ALLOCDIRECT:
3877: adp = WK_ALLOCDIRECT(wk);
3878: adp->ad_state |= COMPLETE;
3879: handle_allocdirect_partdone(adp);
3880: continue;
3881:
3882: case D_ALLOCINDIR:
3883: aip = WK_ALLOCINDIR(wk);
3884: aip->ai_state |= COMPLETE;
3885: handle_allocindir_partdone(aip);
3886: continue;
3887:
3888: case D_INDIRDEP:
3889: indirdep = WK_INDIRDEP(wk);
3890: if (indirdep->ir_state & GOINGAWAY)
3891: panic("disk_write_complete: indirdep gone");
3892: bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3893: free(indirdep->ir_saveddata, M_INDIRDEP);
3894: indirdep->ir_saveddata = 0;
3895: indirdep->ir_state &= ~UNDONE;
3896: indirdep->ir_state |= ATTACHED;
3897: while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3898: handle_allocindir_partdone(aip);
3899: if (aip == LIST_FIRST(&indirdep->ir_donehd))
3900: panic("disk_write_complete: not gone");
3901: }
3902: WORKLIST_INSERT(&reattach, wk);
3903: if ((bp->b_flags & B_DELWRI) == 0)
3904: stat_indir_blk_ptrs++;
3905: buf_dirty(bp);
3906: continue;
3907:
3908: default:
3909: panic("handle_disk_write_complete: Unknown type %s",
3910: TYPENAME(wk->wk_type));
3911: /* NOTREACHED */
3912: }
3913: }
3914: /*
3915: * Reattach any requests that must be redone.
3916: */
3917: while ((wk = LIST_FIRST(&reattach)) != NULL) {
3918: WORKLIST_REMOVE(wk);
3919: WORKLIST_INSERT(&bp->b_dep, wk);
3920: }
3921: #ifdef DEBUG
3922: if (lk.lkt_held != -2)
3923: panic("softdep_disk_write_complete: lock lost");
3924: lk.lkt_held = -1;
3925: #endif
3926: }
3927:
3928: /*
3929: * Called from within softdep_disk_write_complete above. Note that
3930: * this routine is always called from interrupt level with further
3931: * splbio interrupts blocked.
3932: */
3933: STATIC void
3934: handle_allocdirect_partdone(adp)
3935: struct allocdirect *adp; /* the completed allocdirect */
3936: {
3937: struct allocdirect *listadp;
3938: struct inodedep *inodedep;
3939: long bsize, delay;
3940:
3941: splassert(IPL_BIO);
3942:
3943: if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3944: return;
3945: if (adp->ad_buf != NULL)
3946: panic("handle_allocdirect_partdone: dangling dep");
3947:
3948: /*
3949: * The on-disk inode cannot claim to be any larger than the last
3950: * fragment that has been written. Otherwise, the on-disk inode
3951: * might have fragments that were not the last block in the file
3952: * which would corrupt the filesystem. Thus, we cannot free any
3953: * allocdirects after one whose ad_oldblkno claims a fragment as
3954: * these blocks must be rolled back to zero before writing the inode.
3955: * We check the currently active set of allocdirects in id_inoupdt.
3956: */
3957: inodedep = adp->ad_inodedep;
3958: bsize = inodedep->id_fs->fs_bsize;
3959: TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3960: /* found our block */
3961: if (listadp == adp)
3962: break;
3963: /* continue if ad_oldlbn is not a fragment */
3964: if (listadp->ad_oldsize == 0 ||
3965: listadp->ad_oldsize == bsize)
3966: continue;
3967: /* hit a fragment */
3968: return;
3969: }
3970: /*
3971: * If we have reached the end of the current list without
3972: * finding the just finished dependency, then it must be
3973: * on the future dependency list. Future dependencies cannot
3974: * be freed until they are moved to the current list.
3975: */
3976: if (listadp == NULL) {
3977: #ifdef DEBUG
3978: TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3979: /* found our block */
3980: if (listadp == adp)
3981: break;
3982: if (listadp == NULL)
3983: panic("handle_allocdirect_partdone: lost dep");
3984: #endif /* DEBUG */
3985: return;
3986: }
3987: /*
3988: * If we have found the just finished dependency, then free
3989: * it along with anything that follows it that is complete.
3990: * If the inode still has a bitmap dependency, then it has
3991: * never been written to disk, hence the on-disk inode cannot
3992: * reference the old fragment so we can free it without delay.
3993: */
3994: delay = (inodedep->id_state & DEPCOMPLETE);
3995: for (; adp; adp = listadp) {
3996: listadp = TAILQ_NEXT(adp, ad_next);
3997: if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3998: return;
3999: free_allocdirect(&inodedep->id_inoupdt, adp, delay);
4000: }
4001: }
4002:
4003: /*
4004: * Called from within softdep_disk_write_complete above. Note that
4005: * this routine is always called from interrupt level with further
4006: * splbio interrupts blocked.
4007: */
4008: STATIC void
4009: handle_allocindir_partdone(aip)
4010: struct allocindir *aip; /* the completed allocindir */
4011: {
4012: struct indirdep *indirdep;
4013:
4014: splassert(IPL_BIO);
4015:
4016: if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4017: return;
4018: if (aip->ai_buf != NULL)
4019: panic("handle_allocindir_partdone: dangling dependency");
4020: indirdep = aip->ai_indirdep;
4021: if (indirdep->ir_state & UNDONE) {
4022: LIST_REMOVE(aip, ai_next);
4023: LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4024: return;
4025: }
4026: if (indirdep->ir_state & UFS1FMT)
4027: ((int32_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4028: aip->ai_newblkno;
4029: else
4030: ((int64_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4031: aip->ai_newblkno;
4032: LIST_REMOVE(aip, ai_next);
4033: if (aip->ai_freefrag != NULL)
4034: add_to_worklist(&aip->ai_freefrag->ff_list);
4035: WORKITEM_FREE(aip, D_ALLOCINDIR);
4036: }
4037:
4038: /*
4039: * Called from within softdep_disk_write_complete above to restore
4040: * in-memory inode block contents to their most up-to-date state. Note
4041: * that this routine is always called from interrupt level with further
4042: * splbio interrupts blocked.
4043: */
4044: STATIC int
4045: handle_written_inodeblock(inodedep, bp)
4046: struct inodedep *inodedep;
4047: struct buf *bp; /* buffer containing the inode block */
4048: {
4049: struct worklist *wk, *filefree;
4050: struct allocdirect *adp, *nextadp;
4051: struct ufs1_dinode *dp1 = NULL;
4052: struct ufs2_dinode *dp2 = NULL;
4053: int hadchanges, fstype;
4054:
4055: splassert(IPL_BIO);
4056:
4057: if ((inodedep->id_state & IOSTARTED) == 0)
4058: panic("handle_written_inodeblock: not started");
4059: inodedep->id_state &= ~IOSTARTED;
4060:
4061: if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4062: fstype = UM_UFS1;
4063: dp1 = (struct ufs1_dinode *) bp->b_data +
4064: ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4065: } else {
4066: fstype = UM_UFS2;
4067: dp2 = (struct ufs2_dinode *) bp->b_data +
4068: ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4069: }
4070:
4071: /*
4072: * If we had to rollback the inode allocation because of
4073: * bitmaps being incomplete, then simply restore it.
4074: * Keep the block dirty so that it will not be reclaimed until
4075: * all associated dependencies have been cleared and the
4076: * corresponding updates written to disk.
4077: */
4078: if (inodedep->id_savedino1 != NULL) {
4079: if (fstype == UM_UFS1)
4080: *dp1 = *inodedep->id_savedino1;
4081: else
4082: *dp2 = *inodedep->id_savedino2;
4083: FREE(inodedep->id_savedino1, M_INODEDEP);
4084: inodedep->id_savedino1 = NULL;
4085: if ((bp->b_flags & B_DELWRI) == 0)
4086: stat_inode_bitmap++;
4087: buf_dirty(bp);
4088: return (1);
4089: }
4090: inodedep->id_state |= COMPLETE;
4091: /*
4092: * Roll forward anything that had to be rolled back before
4093: * the inode could be updated.
4094: */
4095: hadchanges = 0;
4096: for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4097: nextadp = TAILQ_NEXT(adp, ad_next);
4098: if (adp->ad_state & ATTACHED)
4099: panic("handle_written_inodeblock: new entry");
4100: if (fstype == UM_UFS1) {
4101: if (adp->ad_lbn < NDADDR) {
4102: if (dp1->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4103: panic("%s: %s #%ld mismatch %d != %d",
4104: "handle_written_inodeblock",
4105: "direct pointer", adp->ad_lbn,
4106: dp1->di_db[adp->ad_lbn],
4107: adp->ad_oldblkno);
4108: dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4109: } else {
4110: if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4111: panic("%s: %s #%ld allocated as %d",
4112: "handle_written_inodeblock",
4113: "indirect pointer",
4114: adp->ad_lbn - NDADDR,
4115: dp1->di_ib[adp->ad_lbn - NDADDR]);
4116: dp1->di_ib[adp->ad_lbn - NDADDR] =
4117: adp->ad_newblkno;
4118: }
4119: } else {
4120: if (adp->ad_lbn < NDADDR) {
4121: if (dp2->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4122: panic("%s: %s #%ld mismatch %d != %d",
4123: "handle_written_inodeblock",
4124: "direct pointer", adp->ad_lbn,
4125: dp2->di_db[adp->ad_lbn],
4126: adp->ad_oldblkno);
4127: dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4128: } else {
4129: if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4130: panic("%s: %s #%ld allocated as %d",
4131: "handle_written_inodeblock",
4132: "indirect pointer",
4133: adp->ad_lbn - NDADDR,
4134: dp2->di_ib[adp->ad_lbn - NDADDR]);
4135: dp2->di_ib[adp->ad_lbn - NDADDR] =
4136: adp->ad_newblkno;
4137: }
4138: }
4139: adp->ad_state &= ~UNDONE;
4140: adp->ad_state |= ATTACHED;
4141: hadchanges = 1;
4142: }
4143: if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4144: stat_direct_blk_ptrs++;
4145: /*
4146: * Reset the file size to its most up-to-date value.
4147: */
4148: if (inodedep->id_savedsize == -1)
4149: panic("handle_written_inodeblock: bad size");
4150:
4151: if (fstype == UM_UFS1) {
4152: if (dp1->di_size != inodedep->id_savedsize) {
4153: dp1->di_size = inodedep->id_savedsize;
4154: hadchanges = 1;
4155: }
4156: } else {
4157: if (dp2->di_size != inodedep->id_savedsize) {
4158: dp2->di_size = inodedep->id_savedsize;
4159: hadchanges = 1;
4160: }
4161: }
4162: inodedep->id_savedsize = -1;
4163: /*
4164: * If there were any rollbacks in the inode block, then it must be
4165: * marked dirty so that its will eventually get written back in
4166: * its correct form.
4167: */
4168: if (hadchanges)
4169: buf_dirty(bp);
4170: /*
4171: * Process any allocdirects that completed during the update.
4172: */
4173: if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4174: handle_allocdirect_partdone(adp);
4175: /*
4176: * Process deallocations that were held pending until the
4177: * inode had been written to disk. Freeing of the inode
4178: * is delayed until after all blocks have been freed to
4179: * avoid creation of new <vfsid, inum, lbn> triples
4180: * before the old ones have been deleted.
4181: */
4182: filefree = NULL;
4183: while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4184: WORKLIST_REMOVE(wk);
4185: switch (wk->wk_type) {
4186:
4187: case D_FREEFILE:
4188: /*
4189: * We defer adding filefree to the worklist until
4190: * all other additions have been made to ensure
4191: * that it will be done after all the old blocks
4192: * have been freed.
4193: */
4194: if (filefree != NULL)
4195: panic("handle_written_inodeblock: filefree");
4196: filefree = wk;
4197: continue;
4198:
4199: case D_MKDIR:
4200: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4201: continue;
4202:
4203: case D_DIRADD:
4204: diradd_inode_written(WK_DIRADD(wk), inodedep);
4205: continue;
4206:
4207: case D_FREEBLKS:
4208: wk->wk_state |= COMPLETE;
4209: if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4210: continue;
4211: /* FALLTHROUGH */
4212: case D_FREEFRAG:
4213: case D_DIRREM:
4214: add_to_worklist(wk);
4215: continue;
4216:
4217: case D_NEWDIRBLK:
4218: free_newdirblk(WK_NEWDIRBLK(wk));
4219: continue;
4220:
4221: default:
4222: panic("handle_written_inodeblock: Unknown type %s",
4223: TYPENAME(wk->wk_type));
4224: /* NOTREACHED */
4225: }
4226: }
4227: if (filefree != NULL) {
4228: if (free_inodedep(inodedep) == 0)
4229: panic("handle_written_inodeblock: live inodedep");
4230: add_to_worklist(filefree);
4231: return (0);
4232: }
4233:
4234: /*
4235: * If no outstanding dependencies, free it.
4236: */
4237: if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
4238: return (0);
4239: return (hadchanges);
4240: }
4241:
4242: /*
4243: * Process a diradd entry after its dependent inode has been written.
4244: * This routine must be called with splbio interrupts blocked.
4245: */
4246: STATIC void
4247: diradd_inode_written(dap, inodedep)
4248: struct diradd *dap;
4249: struct inodedep *inodedep;
4250: {
4251: struct pagedep *pagedep;
4252:
4253: splassert(IPL_BIO);
4254:
4255: dap->da_state |= COMPLETE;
4256: if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4257: if (dap->da_state & DIRCHG)
4258: pagedep = dap->da_previous->dm_pagedep;
4259: else
4260: pagedep = dap->da_pagedep;
4261: LIST_REMOVE(dap, da_pdlist);
4262: LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4263: }
4264: WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4265: }
4266:
4267: /*
4268: * Handle the completion of a mkdir dependency.
4269: */
4270: STATIC void
4271: handle_written_mkdir(mkdir, type)
4272: struct mkdir *mkdir;
4273: int type;
4274: {
4275: struct diradd *dap;
4276: struct pagedep *pagedep;
4277:
4278: splassert(IPL_BIO);
4279:
4280: if (mkdir->md_state != type)
4281: panic("handle_written_mkdir: bad type");
4282: dap = mkdir->md_diradd;
4283: dap->da_state &= ~type;
4284: if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4285: dap->da_state |= DEPCOMPLETE;
4286: if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4287: if (dap->da_state & DIRCHG)
4288: pagedep = dap->da_previous->dm_pagedep;
4289: else
4290: pagedep = dap->da_pagedep;
4291: LIST_REMOVE(dap, da_pdlist);
4292: LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4293: }
4294: LIST_REMOVE(mkdir, md_mkdirs);
4295: WORKITEM_FREE(mkdir, D_MKDIR);
4296: }
4297:
4298: /*
4299: * Called from within softdep_disk_write_complete above.
4300: * A write operation was just completed. Removed inodes can
4301: * now be freed and associated block pointers may be committed.
4302: * Note that this routine is always called from interrupt level
4303: * with further splbio interrupts blocked.
4304: */
4305: STATIC int
4306: handle_written_filepage(pagedep, bp)
4307: struct pagedep *pagedep;
4308: struct buf *bp; /* buffer containing the written page */
4309: {
4310: struct dirrem *dirrem;
4311: struct diradd *dap, *nextdap;
4312: struct direct *ep;
4313: int i, chgs;
4314:
4315: splassert(IPL_BIO);
4316:
4317: if ((pagedep->pd_state & IOSTARTED) == 0)
4318: panic("handle_written_filepage: not started");
4319: pagedep->pd_state &= ~IOSTARTED;
4320: /*
4321: * Process any directory removals that have been committed.
4322: */
4323: while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4324: LIST_REMOVE(dirrem, dm_next);
4325: dirrem->dm_dirinum = pagedep->pd_ino;
4326: add_to_worklist(&dirrem->dm_list);
4327: }
4328: /*
4329: * Free any directory additions that have been committed.
4330: * If it is a newly allocated block, we have to wait until
4331: * the on-disk directory inode claims the new block.
4332: */
4333: if ((pagedep->pd_state & NEWBLOCK) == 0)
4334: while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4335: free_diradd(dap);
4336: /*
4337: * Uncommitted directory entries must be restored.
4338: */
4339: for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4340: for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4341: dap = nextdap) {
4342: nextdap = LIST_NEXT(dap, da_pdlist);
4343: if (dap->da_state & ATTACHED)
4344: panic("handle_written_filepage: attached");
4345: ep = (struct direct *)
4346: ((char *)bp->b_data + dap->da_offset);
4347: ep->d_ino = dap->da_newinum;
4348: dap->da_state &= ~UNDONE;
4349: dap->da_state |= ATTACHED;
4350: chgs = 1;
4351: /*
4352: * If the inode referenced by the directory has
4353: * been written out, then the dependency can be
4354: * moved to the pending list.
4355: */
4356: if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4357: LIST_REMOVE(dap, da_pdlist);
4358: LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4359: da_pdlist);
4360: }
4361: }
4362: }
4363: /*
4364: * If there were any rollbacks in the directory, then it must be
4365: * marked dirty so that its will eventually get written back in
4366: * its correct form.
4367: */
4368: if (chgs) {
4369: if ((bp->b_flags & B_DELWRI) == 0)
4370: stat_dir_entry++;
4371: buf_dirty(bp);
4372: return (1);
4373: }
4374: /*
4375: * If we are not waiting for a new directory block to be
4376: * claimed by its inode, then the pagedep will be freed.
4377: * Otherwise it will remain to track any new entries on
4378: * the page in case they are fsync'ed.
4379: */
4380: if ((pagedep->pd_state & NEWBLOCK) == 0) {
4381: LIST_REMOVE(pagedep, pd_hash);
4382: WORKITEM_FREE(pagedep, D_PAGEDEP);
4383: }
4384: return (0);
4385: }
4386:
4387: /*
4388: * Writing back in-core inode structures.
4389: *
4390: * The file system only accesses an inode's contents when it occupies an
4391: * "in-core" inode structure. These "in-core" structures are separate from
4392: * the page frames used to cache inode blocks. Only the latter are
4393: * transferred to/from the disk. So, when the updated contents of the
4394: * "in-core" inode structure are copied to the corresponding in-memory inode
4395: * block, the dependencies are also transferred. The following procedure is
4396: * called when copying a dirty "in-core" inode to a cached inode block.
4397: */
4398:
4399: /*
4400: * Called when an inode is loaded from disk. If the effective link count
4401: * differed from the actual link count when it was last flushed, then we
4402: * need to ensure that the correct effective link count is put back.
4403: */
4404: void
4405: softdep_load_inodeblock(ip)
4406: struct inode *ip; /* the "in_core" copy of the inode */
4407: {
4408: struct inodedep *inodedep;
4409:
4410: /*
4411: * Check for alternate nlink count.
4412: */
4413: ip->i_effnlink = DIP(ip, nlink);
4414: ACQUIRE_LOCK(&lk);
4415: if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4416: FREE_LOCK(&lk);
4417: return;
4418: }
4419: ip->i_effnlink -= inodedep->id_nlinkdelta;
4420: FREE_LOCK(&lk);
4421: }
4422:
4423: /*
4424: * This routine is called just before the "in-core" inode
4425: * information is to be copied to the in-memory inode block.
4426: * Recall that an inode block contains several inodes. If
4427: * the force flag is set, then the dependencies will be
4428: * cleared so that the update can always be made. Note that
4429: * the buffer is locked when this routine is called, so we
4430: * will never be in the middle of writing the inode block
4431: * to disk.
4432: */
4433: void
4434: softdep_update_inodeblock(ip, bp, waitfor)
4435: struct inode *ip; /* the "in_core" copy of the inode */
4436: struct buf *bp; /* the buffer containing the inode block */
4437: int waitfor; /* nonzero => update must be allowed */
4438: {
4439: struct inodedep *inodedep;
4440: struct worklist *wk;
4441: int error, gotit;
4442:
4443: /*
4444: * If the effective link count is not equal to the actual link
4445: * count, then we must track the difference in an inodedep while
4446: * the inode is (potentially) tossed out of the cache. Otherwise,
4447: * if there is no existing inodedep, then there are no dependencies
4448: * to track.
4449: */
4450: ACQUIRE_LOCK(&lk);
4451: if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4452: FREE_LOCK(&lk);
4453: if (ip->i_effnlink != DIP(ip, nlink))
4454: panic("softdep_update_inodeblock: bad link count");
4455: return;
4456: }
4457: if (inodedep->id_nlinkdelta != DIP(ip, nlink) - ip->i_effnlink) {
4458: FREE_LOCK(&lk);
4459: panic("softdep_update_inodeblock: bad delta");
4460: }
4461: /*
4462: * Changes have been initiated. Anything depending on these
4463: * changes cannot occur until this inode has been written.
4464: */
4465: inodedep->id_state &= ~COMPLETE;
4466: if ((inodedep->id_state & ONWORKLIST) == 0)
4467: WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4468: /*
4469: * Any new dependencies associated with the incore inode must
4470: * now be moved to the list associated with the buffer holding
4471: * the in-memory copy of the inode. Once merged process any
4472: * allocdirects that are completed by the merger.
4473: */
4474: merge_inode_lists(inodedep);
4475: if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4476: handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4477: /*
4478: * Now that the inode has been pushed into the buffer, the
4479: * operations dependent on the inode being written to disk
4480: * can be moved to the id_bufwait so that they will be
4481: * processed when the buffer I/O completes.
4482: */
4483: while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4484: WORKLIST_REMOVE(wk);
4485: WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4486: }
4487: /*
4488: * Newly allocated inodes cannot be written until the bitmap
4489: * that allocates them have been written (indicated by
4490: * DEPCOMPLETE being set in id_state). If we are doing a
4491: * forced sync (e.g., an fsync on a file), we force the bitmap
4492: * to be written so that the update can be done.
4493: */
4494: if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4495: FREE_LOCK(&lk);
4496: return;
4497: }
4498: bp = inodedep->id_buf;
4499: gotit = getdirtybuf(bp, MNT_WAIT);
4500: FREE_LOCK(&lk);
4501: if (gotit && (error = bwrite(bp)) != 0)
4502: softdep_error("softdep_update_inodeblock: bwrite", error);
4503: if ((inodedep->id_state & DEPCOMPLETE) == 0)
4504: panic("softdep_update_inodeblock: update failed");
4505: }
4506:
4507: /*
4508: * Merge the new inode dependency list (id_newinoupdt) into the old
4509: * inode dependency list (id_inoupdt). This routine must be called
4510: * with splbio interrupts blocked.
4511: */
4512: STATIC void
4513: merge_inode_lists(inodedep)
4514: struct inodedep *inodedep;
4515: {
4516: struct allocdirect *listadp, *newadp;
4517:
4518: splassert(IPL_BIO);
4519:
4520: newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4521: for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
4522: if (listadp->ad_lbn < newadp->ad_lbn) {
4523: listadp = TAILQ_NEXT(listadp, ad_next);
4524: continue;
4525: }
4526: TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4527: TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4528: if (listadp->ad_lbn == newadp->ad_lbn) {
4529: allocdirect_merge(&inodedep->id_inoupdt, newadp,
4530: listadp);
4531: listadp = newadp;
4532: }
4533: newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4534: }
4535: while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
4536: TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4537: TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
4538: }
4539: }
4540:
4541: /*
4542: * If we are doing an fsync, then we must ensure that any directory
4543: * entries for the inode have been written after the inode gets to disk.
4544: */
4545: int
4546: softdep_fsync(vp)
4547: struct vnode *vp; /* the "in_core" copy of the inode */
4548: {
4549: struct inodedep *inodedep;
4550: struct pagedep *pagedep;
4551: struct worklist *wk;
4552: struct diradd *dap;
4553: struct mount *mnt;
4554: struct vnode *pvp;
4555: struct inode *ip;
4556: struct inode *pip;
4557: struct buf *bp;
4558: struct fs *fs;
4559: struct proc *p = CURPROC; /* XXX */
4560: int error, flushparent;
4561: ino_t parentino;
4562: daddr64_t lbn;
4563:
4564: ip = VTOI(vp);
4565: fs = ip->i_fs;
4566: ACQUIRE_LOCK(&lk);
4567: if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4568: FREE_LOCK(&lk);
4569: return (0);
4570: }
4571: if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4572: LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4573: TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4574: TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4575: FREE_LOCK(&lk);
4576: panic("softdep_fsync: pending ops");
4577: }
4578: for (error = 0, flushparent = 0; ; ) {
4579: if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4580: break;
4581: if (wk->wk_type != D_DIRADD) {
4582: FREE_LOCK(&lk);
4583: panic("softdep_fsync: Unexpected type %s",
4584: TYPENAME(wk->wk_type));
4585: }
4586: dap = WK_DIRADD(wk);
4587: /*
4588: * Flush our parent if this directory entry has a MKDIR_PARENT
4589: * dependency or is contained in a newly allocated block.
4590: */
4591: if (dap->da_state & DIRCHG)
4592: pagedep = dap->da_previous->dm_pagedep;
4593: else
4594: pagedep = dap->da_pagedep;
4595: mnt = pagedep->pd_mnt;
4596: parentino = pagedep->pd_ino;
4597: lbn = pagedep->pd_lbn;
4598: if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4599: FREE_LOCK(&lk);
4600: panic("softdep_fsync: dirty");
4601: }
4602: if ((dap->da_state & MKDIR_PARENT) ||
4603: (pagedep->pd_state & NEWBLOCK))
4604: flushparent = 1;
4605: else
4606: flushparent = 0;
4607: /*
4608: * If we are being fsync'ed as part of vgone'ing this vnode,
4609: * then we will not be able to release and recover the
4610: * vnode below, so we just have to give up on writing its
4611: * directory entry out. It will eventually be written, just
4612: * not now, but then the user was not asking to have it
4613: * written, so we are not breaking any promises.
4614: */
4615: if (vp->v_flag & VXLOCK)
4616: break;
4617: /*
4618: * We prevent deadlock by always fetching inodes from the
4619: * root, moving down the directory tree. Thus, when fetching
4620: * our parent directory, we must unlock ourselves before
4621: * requesting the lock on our parent. See the comment in
4622: * ufs_lookup for details on possible races.
4623: */
4624: FREE_LOCK(&lk);
4625: VOP_UNLOCK(vp, 0, p);
4626: error = VFS_VGET(mnt, parentino, &pvp);
4627: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
4628: if (error != 0)
4629: return (error);
4630: /*
4631: * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4632: * that are contained in direct blocks will be resolved by
4633: * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4634: * may require a complete sync'ing of the directory. So, we
4635: * try the cheap and fast UFS_UPDATE first, and if that fails,
4636: * then we do the slower VOP_FSYNC of the directory.
4637: */
4638: pip = VTOI(pvp);
4639: if (flushparent) {
4640: error = UFS_UPDATE(pip, MNT_WAIT);
4641: if (error) {
4642: vput(pvp);
4643: return (error);
4644: }
4645: if (pagedep->pd_state & NEWBLOCK) {
4646: error = VOP_FSYNC(pvp, p->p_ucred, MNT_WAIT, p);
4647: if (error) {
4648: vput(pvp);
4649: return (error);
4650: }
4651: }
4652: }
4653: /*
4654: * Flush directory page containing the inode's name.
4655: */
4656: error = bread(pvp, lbn, fs->fs_bsize, p->p_ucred, &bp);
4657: if (error == 0) {
4658: bp->b_bcount = blksize(fs, pip, lbn);
4659: error = bwrite(bp);
4660: } else
4661: brelse(bp);
4662: vput(pvp);
4663: if (error != 0)
4664: return (error);
4665: ACQUIRE_LOCK(&lk);
4666: if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4667: break;
4668: }
4669: FREE_LOCK(&lk);
4670: return (0);
4671: }
4672:
4673: /*
4674: * Flush all the dirty bitmaps associated with the block device
4675: * before flushing the rest of the dirty blocks so as to reduce
4676: * the number of dependencies that will have to be rolled back.
4677: */
4678: void
4679: softdep_fsync_mountdev(vp, waitfor)
4680: struct vnode *vp;
4681: int waitfor;
4682: {
4683: struct buf *bp, *nbp;
4684: struct worklist *wk;
4685:
4686: if (!vn_isdisk(vp, NULL))
4687: panic("softdep_fsync_mountdev: vnode not a disk");
4688: ACQUIRE_LOCK(&lk);
4689: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4690: nbp = LIST_NEXT(bp, b_vnbufs);
4691: /*
4692: * If it is already scheduled, skip to the next buffer.
4693: */
4694: if (bp->b_flags & B_BUSY)
4695: continue;
4696: bp->b_flags |= B_BUSY;
4697:
4698: if ((bp->b_flags & B_DELWRI) == 0) {
4699: FREE_LOCK(&lk);
4700: panic("softdep_fsync_mountdev: not dirty");
4701: }
4702: /*
4703: * We are only interested in bitmaps with outstanding
4704: * dependencies.
4705: */
4706: if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4707: wk->wk_type != D_BMSAFEMAP) {
4708: bp->b_flags &= ~B_BUSY;
4709: continue;
4710: }
4711: bremfree(bp);
4712: FREE_LOCK(&lk);
4713: (void) bawrite(bp);
4714: ACQUIRE_LOCK(&lk);
4715: /*
4716: * Since we may have slept during the I/O, we need
4717: * to start from a known point.
4718: */
4719: nbp = LIST_FIRST(&vp->v_dirtyblkhd);
4720: }
4721: if (waitfor == MNT_WAIT)
4722: drain_output(vp, 1);
4723: FREE_LOCK(&lk);
4724: }
4725:
4726: /*
4727: * This routine is called when we are trying to synchronously flush a
4728: * file. This routine must eliminate any filesystem metadata dependencies
4729: * so that the syncing routine can succeed by pushing the dirty blocks
4730: * associated with the file. If any I/O errors occur, they are returned.
4731: */
4732: int
4733: softdep_sync_metadata(ap)
4734: struct vop_fsync_args /* {
4735: struct vnode *a_vp;
4736: struct ucred *a_cred;
4737: int a_waitfor;
4738: struct proc *a_p;
4739: } */ *ap;
4740: {
4741: struct vnode *vp = ap->a_vp;
4742: struct pagedep *pagedep;
4743: struct allocdirect *adp;
4744: struct allocindir *aip;
4745: struct buf *bp, *nbp;
4746: struct worklist *wk;
4747: int i, error, waitfor;
4748:
4749: /*
4750: * Check whether this vnode is involved in a filesystem
4751: * that is doing soft dependency processing.
4752: */
4753: if (!vn_isdisk(vp, NULL)) {
4754: if (!DOINGSOFTDEP(vp))
4755: return (0);
4756: } else
4757: if (vp->v_specmountpoint == NULL ||
4758: (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4759: return (0);
4760: /*
4761: * Ensure that any direct block dependencies have been cleared.
4762: */
4763: ACQUIRE_LOCK(&lk);
4764: if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4765: FREE_LOCK(&lk);
4766: return (error);
4767: }
4768: /*
4769: * For most files, the only metadata dependencies are the
4770: * cylinder group maps that allocate their inode or blocks.
4771: * The block allocation dependencies can be found by traversing
4772: * the dependency lists for any buffers that remain on their
4773: * dirty buffer list. The inode allocation dependency will
4774: * be resolved when the inode is updated with MNT_WAIT.
4775: * This work is done in two passes. The first pass grabs most
4776: * of the buffers and begins asynchronously writing them. The
4777: * only way to wait for these asynchronous writes is to sleep
4778: * on the filesystem vnode which may stay busy for a long time
4779: * if the filesystem is active. So, instead, we make a second
4780: * pass over the dependencies blocking on each write. In the
4781: * usual case we will be blocking against a write that we
4782: * initiated, so when it is done the dependency will have been
4783: * resolved. Thus the second pass is expected to end quickly.
4784: */
4785: waitfor = MNT_NOWAIT;
4786: top:
4787: /*
4788: * We must wait for any I/O in progress to finish so that
4789: * all potential buffers on the dirty list will be visible.
4790: */
4791: drain_output(vp, 1);
4792: bp = LIST_FIRST(&vp->v_dirtyblkhd);
4793: if (getdirtybuf(bp, MNT_WAIT) == 0) {
4794: FREE_LOCK(&lk);
4795: return (0);
4796: }
4797: loop:
4798: /*
4799: * As we hold the buffer locked, none of its dependencies
4800: * will disappear.
4801: */
4802: LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4803: switch (wk->wk_type) {
4804:
4805: case D_ALLOCDIRECT:
4806: adp = WK_ALLOCDIRECT(wk);
4807: if (adp->ad_state & DEPCOMPLETE)
4808: break;
4809: nbp = adp->ad_buf;
4810: if (getdirtybuf(nbp, waitfor) == 0)
4811: break;
4812: FREE_LOCK(&lk);
4813: if (waitfor == MNT_NOWAIT) {
4814: bawrite(nbp);
4815: } else if ((error = VOP_BWRITE(nbp)) != 0) {
4816: bawrite(bp);
4817: return (error);
4818: }
4819: ACQUIRE_LOCK(&lk);
4820: break;
4821:
4822: case D_ALLOCINDIR:
4823: aip = WK_ALLOCINDIR(wk);
4824: if (aip->ai_state & DEPCOMPLETE)
4825: break;
4826: nbp = aip->ai_buf;
4827: if (getdirtybuf(nbp, waitfor) == 0)
4828: break;
4829: FREE_LOCK(&lk);
4830: if (waitfor == MNT_NOWAIT) {
4831: bawrite(nbp);
4832: } else if ((error = VOP_BWRITE(nbp)) != 0) {
4833: bawrite(bp);
4834: return (error);
4835: }
4836: ACQUIRE_LOCK(&lk);
4837: break;
4838:
4839: case D_INDIRDEP:
4840: restart:
4841:
4842: LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4843: if (aip->ai_state & DEPCOMPLETE)
4844: continue;
4845: nbp = aip->ai_buf;
4846: if (getdirtybuf(nbp, MNT_WAIT) == 0)
4847: goto restart;
4848: FREE_LOCK(&lk);
4849: if ((error = VOP_BWRITE(nbp)) != 0) {
4850: bawrite(bp);
4851: return (error);
4852: }
4853: ACQUIRE_LOCK(&lk);
4854: goto restart;
4855: }
4856: break;
4857:
4858: case D_INODEDEP:
4859: if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4860: WK_INODEDEP(wk)->id_ino)) != 0) {
4861: FREE_LOCK(&lk);
4862: bawrite(bp);
4863: return (error);
4864: }
4865: break;
4866:
4867: case D_PAGEDEP:
4868: /*
4869: * We are trying to sync a directory that may
4870: * have dependencies on both its own metadata
4871: * and/or dependencies on the inodes of any
4872: * recently allocated files. We walk its diradd
4873: * lists pushing out the associated inode.
4874: */
4875: pagedep = WK_PAGEDEP(wk);
4876: for (i = 0; i < DAHASHSZ; i++) {
4877: if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4878: continue;
4879: if ((error =
4880: flush_pagedep_deps(vp, pagedep->pd_mnt,
4881: &pagedep->pd_diraddhd[i]))) {
4882: FREE_LOCK(&lk);
4883: bawrite(bp);
4884: return (error);
4885: }
4886: }
4887: break;
4888:
4889: case D_MKDIR:
4890: /*
4891: * This case should never happen if the vnode has
4892: * been properly sync'ed. However, if this function
4893: * is used at a place where the vnode has not yet
4894: * been sync'ed, this dependency can show up. So,
4895: * rather than panic, just flush it.
4896: */
4897: nbp = WK_MKDIR(wk)->md_buf;
4898: if (getdirtybuf(nbp, waitfor) == 0)
4899: break;
4900: FREE_LOCK(&lk);
4901: if (waitfor == MNT_NOWAIT) {
4902: bawrite(nbp);
4903: } else if ((error = VOP_BWRITE(nbp)) != 0) {
4904: bawrite(bp);
4905: return (error);
4906: }
4907: ACQUIRE_LOCK(&lk);
4908: break;
4909:
4910: case D_BMSAFEMAP:
4911: /*
4912: * This case should never happen if the vnode has
4913: * been properly sync'ed. However, if this function
4914: * is used at a place where the vnode has not yet
4915: * been sync'ed, this dependency can show up. So,
4916: * rather than panic, just flush it.
4917: */
4918: nbp = WK_BMSAFEMAP(wk)->sm_buf;
4919: if (getdirtybuf(nbp, waitfor) == 0)
4920: break;
4921: FREE_LOCK(&lk);
4922: if (waitfor == MNT_NOWAIT) {
4923: bawrite(nbp);
4924: } else if ((error = VOP_BWRITE(nbp)) != 0) {
4925: bawrite(bp);
4926: return (error);
4927: }
4928: ACQUIRE_LOCK(&lk);
4929: break;
4930:
4931: default:
4932: FREE_LOCK(&lk);
4933: panic("softdep_sync_metadata: Unknown type %s",
4934: TYPENAME(wk->wk_type));
4935: /* NOTREACHED */
4936: }
4937: }
4938: nbp = LIST_NEXT(bp, b_vnbufs);
4939: getdirtybuf(nbp, MNT_WAIT);
4940: FREE_LOCK(&lk);
4941: bawrite(bp);
4942: ACQUIRE_LOCK(&lk);
4943: if (nbp != NULL) {
4944: bp = nbp;
4945: goto loop;
4946: }
4947: /*
4948: * The brief unlock is to allow any pent up dependency
4949: * processing to be done. Then proceed with the second pass.
4950: */
4951: if (waitfor == MNT_NOWAIT) {
4952: waitfor = MNT_WAIT;
4953: FREE_LOCK(&lk);
4954: ACQUIRE_LOCK(&lk);
4955: goto top;
4956: }
4957:
4958: /*
4959: * If we have managed to get rid of all the dirty buffers,
4960: * then we are done. For certain directories and block
4961: * devices, we may need to do further work.
4962: *
4963: * We must wait for any I/O in progress to finish so that
4964: * all potential buffers on the dirty list will be visible.
4965: */
4966: drain_output(vp, 1);
4967: if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
4968: FREE_LOCK(&lk);
4969: return (0);
4970: }
4971:
4972: FREE_LOCK(&lk);
4973: /*
4974: * If we are trying to sync a block device, some of its buffers may
4975: * contain metadata that cannot be written until the contents of some
4976: * partially written files have been written to disk. The only easy
4977: * way to accomplish this is to sync the entire filesystem (luckily
4978: * this happens rarely).
4979: */
4980: if (vn_isdisk(vp, NULL) &&
4981: vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
4982: (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4983: ap->a_p)) != 0)
4984: return (error);
4985: return (0);
4986: }
4987:
4988: /*
4989: * Flush the dependencies associated with an inodedep.
4990: * Called with splbio blocked.
4991: */
4992: STATIC int
4993: flush_inodedep_deps(fs, ino)
4994: struct fs *fs;
4995: ino_t ino;
4996: {
4997: struct inodedep *inodedep;
4998: struct allocdirect *adp;
4999: int error, waitfor;
5000: struct buf *bp;
5001:
5002: splassert(IPL_BIO);
5003:
5004: /*
5005: * This work is done in two passes. The first pass grabs most
5006: * of the buffers and begins asynchronously writing them. The
5007: * only way to wait for these asynchronous writes is to sleep
5008: * on the filesystem vnode which may stay busy for a long time
5009: * if the filesystem is active. So, instead, we make a second
5010: * pass over the dependencies blocking on each write. In the
5011: * usual case we will be blocking against a write that we
5012: * initiated, so when it is done the dependency will have been
5013: * resolved. Thus the second pass is expected to end quickly.
5014: * We give a brief window at the top of the loop to allow
5015: * any pending I/O to complete.
5016: */
5017: for (waitfor = MNT_NOWAIT; ; ) {
5018: FREE_LOCK(&lk);
5019: ACQUIRE_LOCK(&lk);
5020: if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5021: return (0);
5022: TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
5023: if (adp->ad_state & DEPCOMPLETE)
5024: continue;
5025: bp = adp->ad_buf;
5026: if (getdirtybuf(bp, waitfor) == 0) {
5027: if (waitfor == MNT_NOWAIT)
5028: continue;
5029: break;
5030: }
5031: FREE_LOCK(&lk);
5032: if (waitfor == MNT_NOWAIT) {
5033: bawrite(bp);
5034: } else if ((error = VOP_BWRITE(bp)) != 0) {
5035: ACQUIRE_LOCK(&lk);
5036: return (error);
5037: }
5038: ACQUIRE_LOCK(&lk);
5039: break;
5040: }
5041: if (adp != NULL)
5042: continue;
5043: TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
5044: if (adp->ad_state & DEPCOMPLETE)
5045: continue;
5046: bp = adp->ad_buf;
5047: if (getdirtybuf(bp, waitfor) == 0) {
5048: if (waitfor == MNT_NOWAIT)
5049: continue;
5050: break;
5051: }
5052: FREE_LOCK(&lk);
5053: if (waitfor == MNT_NOWAIT) {
5054: bawrite(bp);
5055: } else if ((error = VOP_BWRITE(bp)) != 0) {
5056: ACQUIRE_LOCK(&lk);
5057: return (error);
5058: }
5059: ACQUIRE_LOCK(&lk);
5060: break;
5061: }
5062: if (adp != NULL)
5063: continue;
5064: /*
5065: * If pass2, we are done, otherwise do pass 2.
5066: */
5067: if (waitfor == MNT_WAIT)
5068: break;
5069: waitfor = MNT_WAIT;
5070: }
5071: /*
5072: * Try freeing inodedep in case all dependencies have been removed.
5073: */
5074: if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5075: (void) free_inodedep(inodedep);
5076: return (0);
5077: }
5078:
5079: /*
5080: * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5081: * Called with splbio blocked.
5082: */
5083: STATIC int
5084: flush_pagedep_deps(pvp, mp, diraddhdp)
5085: struct vnode *pvp;
5086: struct mount *mp;
5087: struct diraddhd *diraddhdp;
5088: {
5089: struct proc *p = CURPROC; /* XXX */
5090: struct worklist *wk;
5091: struct inodedep *inodedep;
5092: struct ufsmount *ump;
5093: struct diradd *dap;
5094: struct vnode *vp;
5095: int gotit, error = 0;
5096: struct buf *bp;
5097: ino_t inum;
5098:
5099: splassert(IPL_BIO);
5100:
5101: ump = VFSTOUFS(mp);
5102: while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5103: /*
5104: * Flush ourselves if this directory entry
5105: * has a MKDIR_PARENT dependency.
5106: */
5107: if (dap->da_state & MKDIR_PARENT) {
5108: FREE_LOCK(&lk);
5109: if ((error = UFS_UPDATE(VTOI(pvp), MNT_WAIT)))
5110: break;
5111: ACQUIRE_LOCK(&lk);
5112: /*
5113: * If that cleared dependencies, go on to next.
5114: */
5115: if (dap != LIST_FIRST(diraddhdp))
5116: continue;
5117: if (dap->da_state & MKDIR_PARENT) {
5118: FREE_LOCK(&lk);
5119: panic("flush_pagedep_deps: MKDIR_PARENT");
5120: }
5121: }
5122: /*
5123: * A newly allocated directory must have its "." and
5124: * ".." entries written out before its name can be
5125: * committed in its parent. We do not want or need
5126: * the full semantics of a synchronous VOP_FSYNC as
5127: * that may end up here again, once for each directory
5128: * level in the filesystem. Instead, we push the blocks
5129: * and wait for them to clear. We have to fsync twice
5130: * because the first call may choose to defer blocks
5131: * that still have dependencies, but deferral will
5132: * happen at most once.
5133: */
5134: inum = dap->da_newinum;
5135: if (dap->da_state & MKDIR_BODY) {
5136: FREE_LOCK(&lk);
5137: if ((error = VFS_VGET(mp, inum, &vp)) != 0)
5138: break;
5139: if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
5140: (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
5141: vput(vp);
5142: break;
5143: }
5144: drain_output(vp, 0);
5145: /*
5146: * If first block is still dirty with a D_MKDIR
5147: * dependency then it needs to be written now.
5148: */
5149: for (;;) {
5150: error = 0;
5151: ACQUIRE_LOCK(&lk);
5152: bp = incore(vp, 0);
5153: if (bp == NULL) {
5154: FREE_LOCK(&lk);
5155: break;
5156: }
5157: LIST_FOREACH(wk, &bp->b_dep, wk_list)
5158: if (wk->wk_type == D_MKDIR)
5159: break;
5160: if (wk) {
5161: gotit = getdirtybuf(bp, MNT_WAIT);
5162: FREE_LOCK(&lk);
5163: if (gotit && (error = bwrite(bp)) != 0)
5164: break;
5165: } else
5166: FREE_LOCK(&lk);
5167: break;
5168: }
5169: vput(vp);
5170: /* Flushing of first block failed */
5171: if (error)
5172: break;
5173: ACQUIRE_LOCK(&lk);
5174: /*
5175: * If that cleared dependencies, go on to next.
5176: */
5177: if (dap != LIST_FIRST(diraddhdp))
5178: continue;
5179: if (dap->da_state & MKDIR_BODY) {
5180: FREE_LOCK(&lk);
5181: panic("flush_pagedep_deps: MKDIR_BODY");
5182: }
5183: }
5184: /*
5185: * Flush the inode on which the directory entry depends.
5186: * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5187: * the only remaining dependency is that the updated inode
5188: * count must get pushed to disk. The inode has already
5189: * been pushed into its inode buffer (via VOP_UPDATE) at
5190: * the time of the reference count change. So we need only
5191: * locate that buffer, ensure that there will be no rollback
5192: * caused by a bitmap dependency, then write the inode buffer.
5193: */
5194: if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5195: FREE_LOCK(&lk);
5196: panic("flush_pagedep_deps: lost inode");
5197: }
5198: /*
5199: * If the inode still has bitmap dependencies,
5200: * push them to disk.
5201: */
5202: if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5203: bp = inodedep->id_buf;
5204: gotit = getdirtybuf(bp, MNT_WAIT);
5205: FREE_LOCK(&lk);
5206: if (gotit && (error = bwrite(bp)) != 0)
5207: break;
5208: ACQUIRE_LOCK(&lk);
5209: if (dap != LIST_FIRST(diraddhdp))
5210: continue;
5211: }
5212: /*
5213: * If the inode is still sitting in a buffer waiting
5214: * to be written, push it to disk.
5215: */
5216: FREE_LOCK(&lk);
5217: if ((error = bread(ump->um_devvp,
5218: fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5219: (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5220: brelse(bp);
5221: break;
5222: }
5223: if ((error = bwrite(bp)) != 0)
5224: break;
5225: ACQUIRE_LOCK(&lk);
5226: /*
5227: * If we have failed to get rid of all the dependencies
5228: * then something is seriously wrong.
5229: */
5230: if (dap == LIST_FIRST(diraddhdp)) {
5231: FREE_LOCK(&lk);
5232: panic("flush_pagedep_deps: flush failed");
5233: }
5234: }
5235: if (error)
5236: ACQUIRE_LOCK(&lk);
5237: return (error);
5238: }
5239:
5240: /*
5241: * A large burst of file addition or deletion activity can drive the
5242: * memory load excessively high. First attempt to slow things down
5243: * using the techniques below. If that fails, this routine requests
5244: * the offending operations to fall back to running synchronously
5245: * until the memory load returns to a reasonable level.
5246: */
5247: int
5248: softdep_slowdown(vp)
5249: struct vnode *vp;
5250: {
5251: int max_softdeps_hard;
5252:
5253: max_softdeps_hard = max_softdeps * 11 / 10;
5254: if (num_dirrem < max_softdeps_hard / 2 &&
5255: num_inodedep < max_softdeps_hard)
5256: return (0);
5257: stat_sync_limit_hit += 1;
5258: return (1);
5259: }
5260:
5261: /*
5262: * If memory utilization has gotten too high, deliberately slow things
5263: * down and speed up the I/O processing.
5264: */
5265: STATIC int
5266: request_cleanup(resource, islocked)
5267: int resource;
5268: int islocked;
5269: {
5270: struct proc *p = CURPROC;
5271: int s;
5272:
5273: /*
5274: * We never hold up the filesystem syncer process.
5275: */
5276: if (p == filesys_syncer || (p->p_flag & P_SOFTDEP))
5277: return (0);
5278: /*
5279: * First check to see if the work list has gotten backlogged.
5280: * If it has, co-opt this process to help clean up two entries.
5281: * Because this process may hold inodes locked, we cannot
5282: * handle any remove requests that might block on a locked
5283: * inode as that could lead to deadlock. We set P_SOFTDEP
5284: * to avoid recursively processing the worklist.
5285: */
5286: if (num_on_worklist > max_softdeps / 10) {
5287: atomic_setbits_int(&p->p_flag, P_SOFTDEP);
5288: if (islocked)
5289: FREE_LOCK(&lk);
5290: process_worklist_item(NULL, LK_NOWAIT);
5291: process_worklist_item(NULL, LK_NOWAIT);
5292: atomic_clearbits_int(&p->p_flag, P_SOFTDEP);
5293: stat_worklist_push += 2;
5294: if (islocked)
5295: ACQUIRE_LOCK(&lk);
5296: return(1);
5297: }
5298: /*
5299: * Next, we attempt to speed up the syncer process. If that
5300: * is successful, then we allow the process to continue.
5301: */
5302: if (speedup_syncer())
5303: return(0);
5304: /*
5305: * If we are resource constrained on inode dependencies, try
5306: * flushing some dirty inodes. Otherwise, we are constrained
5307: * by file deletions, so try accelerating flushes of directories
5308: * with removal dependencies. We would like to do the cleanup
5309: * here, but we probably hold an inode locked at this point and
5310: * that might deadlock against one that we try to clean. So,
5311: * the best that we can do is request the syncer daemon to do
5312: * the cleanup for us.
5313: */
5314: switch (resource) {
5315:
5316: case FLUSH_INODES:
5317: stat_ino_limit_push += 1;
5318: req_clear_inodedeps += 1;
5319: stat_countp = &stat_ino_limit_hit;
5320: break;
5321:
5322: case FLUSH_REMOVE:
5323: stat_blk_limit_push += 1;
5324: req_clear_remove += 1;
5325: stat_countp = &stat_blk_limit_hit;
5326: break;
5327:
5328: default:
5329: if (islocked)
5330: FREE_LOCK(&lk);
5331: panic("request_cleanup: unknown type");
5332: }
5333: /*
5334: * Hopefully the syncer daemon will catch up and awaken us.
5335: * We wait at most tickdelay before proceeding in any case.
5336: */
5337: if (islocked == 0)
5338: ACQUIRE_LOCK(&lk);
5339: proc_waiting += 1;
5340: if (!timeout_pending(&proc_waiting_timeout))
5341: timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5342:
5343: s = FREE_LOCK_INTERLOCKED(&lk);
5344: (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
5345: ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5346: proc_waiting -= 1;
5347: if (islocked == 0)
5348: FREE_LOCK(&lk);
5349: return (1);
5350: }
5351:
5352: /*
5353: * Awaken processes pausing in request_cleanup and clear proc_waiting
5354: * to indicate that there is no longer a timer running.
5355: */
5356: void
5357: pause_timer(arg)
5358: void *arg;
5359: {
5360:
5361: *stat_countp += 1;
5362: wakeup_one(&proc_waiting);
5363: if (proc_waiting > 0)
5364: timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5365: }
5366:
5367: /*
5368: * Flush out a directory with at least one removal dependency in an effort to
5369: * reduce the number of dirrem, freefile, and freeblks dependency structures.
5370: */
5371: STATIC void
5372: clear_remove(p)
5373: struct proc *p;
5374: {
5375: struct pagedep_hashhead *pagedephd;
5376: struct pagedep *pagedep;
5377: static int next = 0;
5378: struct mount *mp;
5379: struct vnode *vp;
5380: int error, cnt;
5381: ino_t ino;
5382:
5383: ACQUIRE_LOCK(&lk);
5384: for (cnt = 0; cnt < pagedep_hash; cnt++) {
5385: pagedephd = &pagedep_hashtbl[next++];
5386: if (next >= pagedep_hash)
5387: next = 0;
5388: LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5389: if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5390: continue;
5391: mp = pagedep->pd_mnt;
5392: ino = pagedep->pd_ino;
5393: #if 0
5394: if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5395: continue;
5396: #endif
5397: FREE_LOCK(&lk);
5398: if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5399: softdep_error("clear_remove: vget", error);
5400: #if 0
5401: vn_finished_write(mp);
5402: #endif
5403: return;
5404: }
5405: if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5406: softdep_error("clear_remove: fsync", error);
5407: drain_output(vp, 0);
5408: vput(vp);
5409: #if 0
5410: vn_finished_write(mp);
5411: #endif
5412: return;
5413: }
5414: }
5415: FREE_LOCK(&lk);
5416: }
5417:
5418: /*
5419: * Clear out a block of dirty inodes in an effort to reduce
5420: * the number of inodedep dependency structures.
5421: */
5422: STATIC void
5423: clear_inodedeps(p)
5424: struct proc *p;
5425: {
5426: struct inodedep_hashhead *inodedephd;
5427: struct inodedep *inodedep;
5428: static int next = 0;
5429: struct mount *mp;
5430: struct vnode *vp;
5431: struct fs *fs;
5432: int error, cnt;
5433: ino_t firstino, lastino, ino;
5434:
5435: ACQUIRE_LOCK(&lk);
5436: /*
5437: * Pick a random inode dependency to be cleared.
5438: * We will then gather up all the inodes in its block
5439: * that have dependencies and flush them out.
5440: */
5441: for (cnt = 0; cnt < inodedep_hash; cnt++) {
5442: inodedephd = &inodedep_hashtbl[next++];
5443: if (next >= inodedep_hash)
5444: next = 0;
5445: if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5446: break;
5447: }
5448: if (inodedep == NULL) {
5449: FREE_LOCK(&lk);
5450: return;
5451: }
5452: /*
5453: * Ugly code to find mount point given pointer to superblock.
5454: */
5455: fs = inodedep->id_fs;
5456: CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
5457: if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5458: break;
5459: /*
5460: * Find the last inode in the block with dependencies.
5461: */
5462: firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5463: for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5464: if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5465: break;
5466: /*
5467: * Asynchronously push all but the last inode with dependencies.
5468: * Synchronously push the last inode with dependencies to ensure
5469: * that the inode block gets written to free up the inodedeps.
5470: */
5471: for (ino = firstino; ino <= lastino; ino++) {
5472: if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5473: continue;
5474: FREE_LOCK(&lk);
5475: #if 0
5476: if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5477: continue;
5478: #endif
5479: if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5480: softdep_error("clear_inodedeps: vget", error);
5481: #if 0
5482: vn_finished_write(mp);
5483: #endif
5484: return;
5485: }
5486: if (ino == lastino) {
5487: if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
5488: softdep_error("clear_inodedeps: fsync1", error);
5489: } else {
5490: if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5491: softdep_error("clear_inodedeps: fsync2", error);
5492: drain_output(vp, 0);
5493: }
5494: vput(vp);
5495: #if 0
5496: vn_finished_write(mp);
5497: #endif
5498: ACQUIRE_LOCK(&lk);
5499: }
5500: FREE_LOCK(&lk);
5501: }
5502:
5503: /*
5504: * Function to determine if the buffer has outstanding dependencies
5505: * that will cause a roll-back if the buffer is written. If wantcount
5506: * is set, return number of dependencies, otherwise just yes or no.
5507: */
5508: int
5509: softdep_count_dependencies(bp, wantcount, islocked)
5510: struct buf *bp;
5511: int wantcount;
5512: int islocked;
5513: {
5514: struct worklist *wk;
5515: struct inodedep *inodedep;
5516: struct indirdep *indirdep;
5517: struct allocindir *aip;
5518: struct pagedep *pagedep;
5519: struct diradd *dap;
5520: int i, retval;
5521:
5522: retval = 0;
5523: if (!islocked)
5524: ACQUIRE_LOCK(&lk);
5525: LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5526: switch (wk->wk_type) {
5527:
5528: case D_INODEDEP:
5529: inodedep = WK_INODEDEP(wk);
5530: if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5531: /* bitmap allocation dependency */
5532: retval += 1;
5533: if (!wantcount)
5534: goto out;
5535: }
5536: if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5537: /* direct block pointer dependency */
5538: retval += 1;
5539: if (!wantcount)
5540: goto out;
5541: }
5542: continue;
5543:
5544: case D_INDIRDEP:
5545: indirdep = WK_INDIRDEP(wk);
5546:
5547: LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5548: /* indirect block pointer dependency */
5549: retval += 1;
5550: if (!wantcount)
5551: goto out;
5552: }
5553: continue;
5554:
5555: case D_PAGEDEP:
5556: pagedep = WK_PAGEDEP(wk);
5557: for (i = 0; i < DAHASHSZ; i++) {
5558:
5559: LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5560: /* directory entry dependency */
5561: retval += 1;
5562: if (!wantcount)
5563: goto out;
5564: }
5565: }
5566: continue;
5567:
5568: case D_BMSAFEMAP:
5569: case D_ALLOCDIRECT:
5570: case D_ALLOCINDIR:
5571: case D_MKDIR:
5572: /* never a dependency on these blocks */
5573: continue;
5574:
5575: default:
5576: if (!islocked)
5577: FREE_LOCK(&lk);
5578: panic("softdep_check_for_rollback: Unexpected type %s",
5579: TYPENAME(wk->wk_type));
5580: /* NOTREACHED */
5581: }
5582: }
5583: out:
5584: if (!islocked)
5585: FREE_LOCK(&lk);
5586: return retval;
5587: }
5588:
5589: /*
5590: * Acquire exclusive access to a buffer.
5591: * Must be called with splbio blocked.
5592: * Return 1 if buffer was acquired.
5593: */
5594: STATIC int
5595: getdirtybuf(bp, waitfor)
5596: struct buf *bp;
5597: int waitfor;
5598: {
5599: int s;
5600:
5601: if (bp == NULL)
5602: return (0);
5603:
5604: splassert(IPL_BIO);
5605:
5606: for (;;) {
5607: if ((bp->b_flags & B_BUSY) == 0)
5608: break;
5609: if (waitfor != MNT_WAIT)
5610: return (0);
5611: bp->b_flags |= B_WANTED;
5612: s = FREE_LOCK_INTERLOCKED(&lk);
5613: tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
5614: ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5615: }
5616: if ((bp->b_flags & B_DELWRI) == 0)
5617: return (0);
5618: bremfree(bp);
5619: bp->b_flags |= B_BUSY;
5620: return (1);
5621: }
5622:
5623: /*
5624: * Wait for pending output on a vnode to complete.
5625: * Must be called with vnode locked.
5626: */
5627: STATIC void
5628: drain_output(vp, islocked)
5629: struct vnode *vp;
5630: int islocked;
5631: {
5632: int s;
5633:
5634: if (!islocked)
5635: ACQUIRE_LOCK(&lk);
5636:
5637: splassert(IPL_BIO);
5638:
5639: while (vp->v_numoutput) {
5640: vp->v_bioflag |= VBIOWAIT;
5641: s = FREE_LOCK_INTERLOCKED(&lk);
5642: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drain_output", 0);
5643: ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5644: }
5645: if (!islocked)
5646: FREE_LOCK(&lk);
5647: }
5648:
5649: /*
5650: * Called whenever a buffer that is being invalidated or reallocated
5651: * contains dependencies. This should only happen if an I/O error has
5652: * occurred. The routine is called with the buffer locked.
5653: */
5654: void
5655: softdep_deallocate_dependencies(bp)
5656: struct buf *bp;
5657: {
5658:
5659: if ((bp->b_flags & B_ERROR) == 0)
5660: panic("softdep_deallocate_dependencies: dangling deps");
5661: softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5662: panic("softdep_deallocate_dependencies: unrecovered I/O error");
5663: }
5664:
5665: /*
5666: * Function to handle asynchronous write errors in the filesystem.
5667: */
5668: void
5669: softdep_error(func, error)
5670: char *func;
5671: int error;
5672: {
5673:
5674: /* XXX should do something better! */
5675: printf("%s: got error %d while accessing filesystem\n", func, error);
5676: }
5677:
5678: #ifdef DDB
5679: #include <machine/db_machdep.h>
5680: #include <ddb/db_interface.h>
5681: #include <ddb/db_output.h>
5682:
5683: void
5684: softdep_print(struct buf *bp, int full, int (*pr)(const char *, ...))
5685: {
5686: struct worklist *wk;
5687:
5688: (*pr)(" deps:\n");
5689: LIST_FOREACH(wk, &bp->b_dep, wk_list)
5690: worklist_print(wk, full, pr);
5691: }
5692:
5693: void
5694: worklist_print(struct worklist *wk, int full, int (*pr)(const char *, ...))
5695: {
5696: struct pagedep *pagedep;
5697: struct inodedep *inodedep;
5698: struct newblk *newblk;
5699: struct bmsafemap *bmsafemap;
5700: struct allocdirect *adp;
5701: struct indirdep *indirdep;
5702: struct allocindir *aip;
5703: struct freefrag *freefrag;
5704: struct freeblks *freeblks;
5705: struct freefile *freefile;
5706: struct diradd *dap;
5707: struct mkdir *mkdir;
5708: struct dirrem *dirrem;
5709: struct newdirblk *newdirblk;
5710: char prefix[33];
5711: int i;
5712:
5713: for (prefix[i = 2 * MIN(16, full)] = '\0'; i--; prefix[i] = ' ')
5714: ;
5715:
5716: (*pr)("%s%s(%p) state %b\n%s", prefix, TYPENAME(wk->wk_type), wk,
5717: wk->wk_state, DEP_BITS, prefix);
5718: switch (wk->wk_type) {
5719: case D_PAGEDEP:
5720: pagedep = WK_PAGEDEP(wk);
5721: (*pr)("mount %p ino %u lbn %lld\n", pagedep->pd_mnt,
5722: pagedep->pd_ino, pagedep->pd_lbn);
5723: break;
5724: case D_INODEDEP:
5725: inodedep = WK_INODEDEP(wk);
5726: (*pr)("fs %p ino %u nlinkdelta %u dino %p\n"
5727: "%s bp %p savsz %lld\n", inodedep->id_fs,
5728: inodedep->id_ino, inodedep->id_nlinkdelta,
5729: inodedep->id_un.idu_savedino1,
5730: prefix, inodedep->id_buf, inodedep->id_savedsize);
5731: break;
5732: case D_NEWBLK:
5733: newblk = WK_NEWBLK(wk);
5734: (*pr)("fs %p newblk %d state %d bmsafemap %p\n",
5735: newblk->nb_fs, newblk->nb_newblkno, newblk->nb_state,
5736: newblk->nb_bmsafemap);
5737: break;
5738: case D_BMSAFEMAP:
5739: bmsafemap = WK_BMSAFEMAP(wk);
5740: (*pr)("buf %p\n", bmsafemap->sm_buf);
5741: break;
5742: case D_ALLOCDIRECT:
5743: adp = WK_ALLOCDIRECT(wk);
5744: (*pr)("lbn %lld newlbk %d oldblk %d newsize %lu olsize %lu\n"
5745: "%s bp %p inodedep %p freefrag %p\n", adp->ad_lbn,
5746: adp->ad_newblkno, adp->ad_oldblkno, adp->ad_newsize,
5747: adp->ad_oldsize,
5748: prefix, adp->ad_buf, adp->ad_inodedep, adp->ad_freefrag);
5749: break;
5750: case D_INDIRDEP:
5751: indirdep = WK_INDIRDEP(wk);
5752: (*pr)("savedata %p savebp %p\n", indirdep->ir_saveddata,
5753: indirdep->ir_savebp);
5754: break;
5755: case D_ALLOCINDIR:
5756: aip = WK_ALLOCINDIR(wk);
5757: (*pr)("off %d newblk %d oldblk %d freefrag %p\n"
5758: "%s indirdep %p buf %p\n", aip->ai_offset,
5759: aip->ai_newblkno, aip->ai_oldblkno, aip->ai_freefrag,
5760: prefix, aip->ai_indirdep, aip->ai_buf);
5761: break;
5762: case D_FREEFRAG:
5763: freefrag = WK_FREEFRAG(wk);
5764: (*pr)("vnode %p mp %p blkno %d fsize %ld ino %u\n",
5765: freefrag->ff_devvp, freefrag->ff_mnt, freefrag->ff_blkno,
5766: freefrag->ff_fragsize, freefrag->ff_inum);
5767: break;
5768: case D_FREEBLKS:
5769: freeblks = WK_FREEBLKS(wk);
5770: (*pr)("previno %u devvp %p mp %p oldsz %lld newsz %lld\n"
5771: "%s chkcnt %d uid %d\n", freeblks->fb_previousinum,
5772: freeblks->fb_devvp, freeblks->fb_mnt, freeblks->fb_oldsize,
5773: freeblks->fb_newsize,
5774: prefix, freeblks->fb_chkcnt, freeblks->fb_uid);
5775: break;
5776: case D_FREEFILE:
5777: freefile = WK_FREEFILE(wk);
5778: (*pr)("mode %x oldino %u vnode %p mp %p\n", freefile->fx_mode,
5779: freefile->fx_oldinum, freefile->fx_devvp, freefile->fx_mnt);
5780: break;
5781: case D_DIRADD:
5782: dap = WK_DIRADD(wk);
5783: (*pr)("off %ld ino %u da_un %p\n", dap->da_offset,
5784: dap->da_newinum, dap->da_un.dau_previous);
5785: break;
5786: case D_MKDIR:
5787: mkdir = WK_MKDIR(wk);
5788: (*pr)("diradd %p bp %p\n", mkdir->md_diradd, mkdir->md_buf);
5789: break;
5790: case D_DIRREM:
5791: dirrem = WK_DIRREM(wk);
5792: (*pr)("mp %p ino %u dm_un %p\n", dirrem->dm_mnt,
5793: dirrem->dm_oldinum, dirrem->dm_un.dmu_pagedep);
5794: break;
5795: case D_NEWDIRBLK:
5796: newdirblk = WK_NEWDIRBLK(wk);
5797: (*pr)("pagedep %p\n", newdirblk->db_pagedep);
5798: break;
5799: }
5800: }
5801: #endif
CVSweb