Version:
~ [ 0.6-2.3.46 ] ~
Architecture:
~ [ um ] ~
** Warning: Cannot open xref database.
1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
23 /* Added 32k buffer block sizes - these are required by older ARM systems.
24 * - RMK
25 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/sched.h>
32 #include <linux/fs.h>
33 #include <linux/malloc.h>
34 #include <linux/locks.h>
35 #include <linux/errno.h>
36 #include <linux/swap.h>
37 #include <linux/smp_lock.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/sysrq.h>
41 #include <linux/file.h>
42 #include <linux/init.h>
43 #include <linux/quotaops.h>
44 #include <linux/iobuf.h>
45 #include <linux/highmem.h>
46
47 #include <asm/uaccess.h>
48 #include <asm/io.h>
49 #include <asm/bitops.h>
50 #include <asm/mmu_context.h>
51
/*
 * Supported buffer sizes are the NR_SIZES powers of two from 512 bytes
 * up to 32k.  buffersize_index[] maps (size >> 9) onto a dense index
 * 0..NR_SIZES-1 (used to pick a free_list[] slot); every entry that does
 * not correspond to a supported size holds -1.
 *
 * The element type is explicitly 'signed char': the signedness of plain
 * 'char' is implementation-defined, and where it is unsigned the -1
 * markers would read back as 255.
 */
#define NR_SIZES 7
static signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

/* Index of a buffer size (in bytes) into the per-size tables, or -1. */
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Largest number of (512-byte) buffers that can share one page. */
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
/* Buffer heads kept back for async IO requests. */
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
/*
 * Don't ever have more than this number of unused buffer heads.
 * Parenthesized so the macro expands safely inside larger expressions.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
65
66 /* Anti-deadlock ordering:
67 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
68 */
69
70 /*
71 * Hash table gook..
72 */
73 static unsigned int bh_hash_mask = 0;
74 static unsigned int bh_hash_shift = 0;
75 static struct buffer_head **hash_table;
76 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
77
78 static struct buffer_head *lru_list[NR_LIST];
79 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
80 static int nr_buffers_type[NR_LIST] = {0,};
81 static unsigned long size_buffers_type[NR_LIST] = {0,};
82
83 static struct buffer_head * unused_list = NULL;
84 static int nr_unused_buffer_heads = 0;
85 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
86 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
87
88 struct bh_free_head {
89 struct buffer_head *list;
90 spinlock_t lock;
91 };
92 static struct bh_free_head free_list[NR_SIZES];
93
94 kmem_cache_t *bh_cachep;
95
96 static int grow_buffers(int size);
97 static void __refile_buffer(struct buffer_head *);
98
99 /* This is used by some architectures to estimate available memory. */
100 atomic_t buffermem_pages = ATOMIC_INIT(0);
101
102 /* Here is the parameter block for the bdflush process. If you add or
103 * remove any of the parameters, make sure to update kernel/sysctl.c.
104 */
105
#define N_PARAM 9

/*
 * Tunables of the bdflush process, reachable either by name (b_un) or
 * as a flat array (data).  The dummy members are kept only so that old
 * programs that play with the /proc entries keep working.
 */
union bdflush_param {
	struct {
		/* Percentage of buffer cache dirty to activate bdflush */
		int nfract;
		/* Maximum number of dirty blocks to write out per wake-cycle */
		int ndirty;
		/* Number of clean buffers to try to obtain each time we call refill */
		int nrefill;
		/* Dirty buffer threshold for activating bdflush when trying to
		   refill buffers. */
		int nref_dirt;
		/* jiffies delay between kupdate flushes */
		int interval;
		/* Time for normal buffer to age before we flush it */
		int age_buffer;
		/* Time for superblock to age before we flush it */
		int age_super;
		/* unused */
		int dummy2;
		/* unused */
		int dummy3;
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};

/* Smallest and largest value accepted for each parameter, in order. */
int bdflush_min[N_PARAM] = {
	0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1
};
int bdflush_max[N_PARAM] = {
	100, 50000, 20000, 20000, 600*HZ, 6000*HZ, 6000*HZ, 2047, 5
};
133
134 /*
135 * Rewrote the wait-routines to use the "new" wait-queue functionality,
136 * and getting rid of the cli-sti pairs. The wait-queue routines still
137 * need cli-sti, but now it's just a couple of 386 instructions or so.
138 *
139 * Note that the real wait_on_buffer() is an inline function that checks
140 * if 'b_wait' is set before calling this, so that the queues aren't set
141 * up unnecessarily.
142 */
143 void __wait_on_buffer(struct buffer_head * bh)
144 {
145 struct task_struct *tsk = current;
146 DECLARE_WAITQUEUE(wait, tsk);
147
148 atomic_inc(&bh->b_count);
149 add_wait_queue(&bh->b_wait, &wait);
150 repeat:
151 run_task_queue(&tq_disk);
152 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
153 if (buffer_locked(bh)) {
154 schedule();
155 goto repeat;
156 }
157 tsk->state = TASK_RUNNING;
158 remove_wait_queue(&bh->b_wait, &wait);
159 atomic_dec(&bh->b_count);
160 }
161
162 /* Call sync_buffers with wait!=0 to ensure that the call does not
163 * return until all buffer writes have completed. Sync() may return
164 * before the writes have finished; fsync() may not.
165 */
166
167 /* Godamity-damn. Some buffers (bitmaps for filesystems)
168 * spontaneously dirty themselves without ever brelse being called.
169 * We will ultimately want to put these in a separate list, but for
170 * now we search all of the lists for dirty buffers.
171 */
/*
 * Write out the dirty buffers of 'dev' (of every device when dev is 0).
 *
 * wait == 0: a single pass that starts writes on unlocked dirty buffers.
 * wait != 0: up to three passes, as described below, so that the writes
 *            have completed by the time we return.
 *
 * Returns 0, or -EIO when 'wait' is set and a requested, unlocked, clean
 * buffer turns out not to be uptodate.
 *
 * Whenever lru_list_lock has to be dropped (to start IO or to sleep on a
 * locked buffer) the buffer is first pinned through b_count, and the scan
 * of the list is restarted from its head afterwards.
 */
172 static int sync_buffers(kdev_t dev, int wait)
173 {
174 int i, retry, pass = 0, err = 0;
175 struct buffer_head * bh, *next;
176
177 /* One pass for no-wait, three for wait:
178 * 0) write out all dirty, unlocked buffers;
179 * 1) write out all dirty buffers, waiting if locked;
180 * 2) wait for completion by waiting for all buffers to unlock.
181 */
182 do {
183 retry = 0;
184
185 /* We search all lists as a failsafe mechanism, not because we expect
186 * there to be dirty buffers on any of the other lists.
187 */
188 repeat:
189 spin_lock(&lru_list_lock);
190 bh = lru_list[BUF_DIRTY];
191 if (!bh)
192 goto repeat2;
193
/* NOTE(review): the walk is bounded by twice the list length at the time
 * the scan starts; presumably a safety margin on the circular list - confirm.
 */
194 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
195 next = bh->b_next_free;
196
197 if (!lru_list[BUF_DIRTY])
198 break;
199 if (dev && bh->b_dev != dev)
200 continue;
201 if (buffer_locked(bh)) {
202 /* Buffer is locked; skip it unless wait is
203 * requested AND pass > 0.
204 */
205 if (!wait || !pass) {
206 retry = 1;
207 continue;
208 }
/* The lock is re-taken at the 'repeat' label, not here. */
209 atomic_inc(&bh->b_count);
210 spin_unlock(&lru_list_lock);
211 wait_on_buffer (bh);
212 atomic_dec(&bh->b_count);
213 goto repeat;
214 }
215
216 /* If an unlocked buffer is not uptodate, there has
217 * been an IO error. Skip it.
218 */
219 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
220 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
221 err = -EIO;
222 continue;
223 }
224
225 /* Don't write clean buffers. Don't write ANY buffers
226 * on the third pass.
227 */
228 if (!buffer_dirty(bh) || pass >= 2)
229 continue;
230
/* Start the write with the list lock dropped, then rescan from the head. */
231 atomic_inc(&bh->b_count);
232 spin_unlock(&lru_list_lock);
233 ll_rw_block(WRITE, 1, &bh);
234 atomic_dec(&bh->b_count);
235 retry = 1;
236 goto repeat;
237 }
238
/* Second half: the BUF_LOCKED list.  lru_list_lock is held on entry. */
239 repeat2:
240 bh = lru_list[BUF_LOCKED];
241 if (!bh) {
242 spin_unlock(&lru_list_lock);
243 break;
244 }
245 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
246 next = bh->b_next_free;
247
248 if (!lru_list[BUF_LOCKED])
249 break;
250 if (dev && bh->b_dev != dev)
251 continue;
252 if (buffer_locked(bh)) {
253 /* Buffer is locked; skip it unless wait is
254 * requested AND pass > 0.
255 */
256 if (!wait || !pass) {
257 retry = 1;
258 continue;
259 }
/* Unlike above, the lock is re-taken here because 'repeat2' expects it held. */
260 atomic_inc(&bh->b_count);
261 spin_unlock(&lru_list_lock);
262 wait_on_buffer (bh);
263 spin_lock(&lru_list_lock);
264 atomic_dec(&bh->b_count);
265 goto repeat2;
266 }
267 }
268 spin_unlock(&lru_list_lock);
269
270 /* If we are waiting for the sync to succeed, and if any dirty
271 * blocks were written, then repeat; on the second pass, only
272 * wait for buffers being written (do not pass to write any
273 * more buffers on the second pass).
274 */
275 } while (wait && retry && ++pass<=2);
276 return err;
277 }
278
279 void sync_dev(kdev_t dev)
280 {
281 sync_supers(dev);
282 sync_inodes(dev);
283 DQUOT_SYNC(dev);
284 /* sync all the dirty buffers out to disk only _after_ all the
285 high level layers finished generated buffer dirty data
286 (or we'll return with some buffer still dirty on the blockdevice
287 so breaking the semantics of this call) */
288 sync_buffers(dev, 0);
289 /*
290 * FIXME(eric) we need to sync the physical devices here.
291 * This is because some (scsi) controllers have huge amounts of
292 * cache onboard (hundreds of Mb), and we need to instruct
293 * them to commit all of the dirty memory to disk, and we should
294 * not return until this has happened.
295 *
296 * This would need to get implemented by going through the assorted
297 * layers so that each block major number can be synced, and this
298 * would call down into the upper and mid-layer scsi.
299 */
300 }
301
302 int fsync_dev(kdev_t dev)
303 {
304 sync_buffers(dev, 0);
305
306 lock_kernel();
307 sync_supers(dev);
308 sync_inodes(dev);
309 DQUOT_SYNC(dev);
310 unlock_kernel();
311
312 return sync_buffers(dev, 1);
313 }
314
315 asmlinkage long sys_sync(void)
316 {
317 fsync_dev(0);
318 return 0;
319 }
320
321 /*
322 * filp may be NULL if called via the msync of a vma.
323 */
324
325 int file_fsync(struct file *filp, struct dentry *dentry)
326 {
327 struct inode * inode = dentry->d_inode;
328 struct super_block * sb;
329 kdev_t dev;
330 int ret;
331
332 lock_kernel();
333 /* sync the inode to buffers */
334 write_inode_now(inode);
335
336 /* sync the superblock to buffers */
337 sb = inode->i_sb;
338 wait_on_super(sb);
339 if (sb->s_op && sb->s_op->write_super)
340 sb->s_op->write_super(sb);
341
342 /* .. finally sync the buffers to disk */
343 dev = inode->i_dev;
344 ret = sync_buffers(dev, 1);
345 unlock_kernel();
346 return ret;
347 }
348
349 asmlinkage long sys_fsync(unsigned int fd)
350 {
351 struct file * file;
352 struct dentry * dentry;
353 struct inode * inode;
354 int err;
355
356 err = -EBADF;
357 file = fget(fd);
358 if (!file)
359 goto out;
360
361 dentry = file->f_dentry;
362 if (!dentry)
363 goto out_putf;
364
365 inode = dentry->d_inode;
366 if (!inode)
367 goto out_putf;
368
369 err = -EINVAL;
370 if (!file->f_op || !file->f_op->fsync)
371 goto out_putf;
372
373 /* We need to protect against concurrent writers.. */
374 down(&inode->i_sem);
375 err = file->f_op->fsync(file, dentry);
376 up(&inode->i_sem);
377
378 out_putf:
379 fput(file);
380 out:
381 return err;
382 }
383
384 asmlinkage long sys_fdatasync(unsigned int fd)
385 {
386 struct file * file;
387 struct dentry * dentry;
388 struct inode * inode;
389 int err;
390
391 err = -EBADF;
392 file = fget(fd);
393 if (!file)
394 goto out;
395
396 dentry = file->f_dentry;
397 if (!dentry)
398 goto out_putf;
399
400 inode = dentry->d_inode;
401 if (!inode)
402 goto out_putf;
403
404 err = -EINVAL;
405 if (!file->f_op || !file->f_op->fsync)
406 goto out_putf;
407
408 /* this needs further work, at the moment it is identical to fsync() */
409 down(&inode->i_sem);
410 err = file->f_op->fsync(file, dentry);
411 up(&inode->i_sem);
412
413 out_putf:
414 fput(file);
415 out:
416 return err;
417 }
418
419 /* After several hours of tedious analysis, the following hash
420 * function won. Do not mess with it... -DaveM
421 */
/*
 * _hashfn() mixes shifted copies of the device and block numbers;
 * hash() masks the result and names the matching hash_table bucket.
 */
#define _hashfn(dev,block) \
	(((dev) << (bh_hash_shift - 6)) ^ \
	 ((dev) << (bh_hash_shift - 9)) ^ \
	 ((block) << (bh_hash_shift - 6)) ^ \
	 ((block) >> 13) ^ \
	 ((block) << (bh_hash_shift - 12)))
#define hash(dev,block) (hash_table[_hashfn(dev,block) & bh_hash_mask])
426
427 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
428 {
429 if ((bh->b_next = *head) != NULL)
430 bh->b_next->b_pprev = &bh->b_next;
431 *head = bh;
432 bh->b_pprev = head;
433 }
434
435 static __inline__ void __hash_unlink(struct buffer_head *bh)
436 {
437 if (bh->b_pprev) {
438 if (bh->b_next)
439 bh->b_next->b_pprev = bh->b_pprev;
440 *(bh->b_pprev) = bh->b_next;
441 bh->b_pprev = NULL;
442 }
443 }
444
445 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
446 {
447 struct buffer_head **bhp = &lru_list[blist];
448
449 if(!*bhp) {
450 *bhp = bh;
451 bh->b_prev_free = bh;
452 }
453 bh->b_next_free = *bhp;
454 bh->b_prev_free = (*bhp)->b_prev_free;
455 (*bhp)->b_prev_free->b_next_free = bh;
456 (*bhp)->b_prev_free = bh;
457 nr_buffers_type[blist]++;
458 size_buffers_type[blist] += bh->b_size;
459 }
460
461 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
462 {
463 if (bh->b_prev_free || bh->b_next_free) {
464 bh->b_prev_free->b_next_free = bh->b_next_free;
465 bh->b_next_free->b_prev_free = bh->b_prev_free;
466 if (lru_list[blist] == bh)
467 lru_list[blist] = bh->b_next_free;
468 if (lru_list[blist] == bh)
469 lru_list[blist] = NULL;
470 bh->b_next_free = bh->b_prev_free = NULL;
471 nr_buffers_type[blist]--;
472 size_buffers_type[blist] -= bh->b_size;
473 }
474 }
475
476 static void __remove_from_free_list(struct buffer_head * bh, int index)
477 {
478 if(bh->b_next_free == bh)
479 free_list[index].list = NULL;
480 else {
481 bh->b_prev_free->b_next_free = bh->b_next_free;
482 bh->b_next_free->b_prev_free = bh->b_prev_free;
483 if (free_list[index].list == bh)
484 free_list[index].list = bh->b_next_free;
485 }
486 bh->b_next_free = bh->b_prev_free = NULL;
487 }
488
489 /* must be called with both the hash_table_lock and the lru_list_lock
490 held */
491 static void __remove_from_queues(struct buffer_head *bh)
492 {
493 __hash_unlink(bh);
494 __remove_from_lru_list(bh, bh->b_list);
495 }
496
497 static void insert_into_queues(struct buffer_head *bh)
498 {
499 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
500
501 spin_lock(&lru_list_lock);
502 write_lock(&hash_table_lock);
503 __hash_link(bh, head);
504 __insert_into_lru_list(bh, bh->b_list);
505 write_unlock(&hash_table_lock);
506 spin_unlock(&lru_list_lock);
507 }
508
509 /* This function must only run if there are no other
510 * references _anywhere_ to this buffer head.
511 */
512 static void put_last_free(struct buffer_head * bh)
513 {
514 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
515 struct buffer_head **bhp = &head->list;
516
517 bh->b_state = 0;
518
519 spin_lock(&head->lock);
520 bh->b_dev = B_FREE;
521 if(!*bhp) {
522 *bhp = bh;
523 bh->b_prev_free = bh;
524 }
525 bh->b_next_free = *bhp;
526 bh->b_prev_free = (*bhp)->b_prev_free;
527 (*bhp)->b_prev_free->b_next_free = bh;
528 (*bhp)->b_prev_free = bh;
529 spin_unlock(&head->lock);
530 }
531
532 /*
533 * Why like this, I hear you say... The reason is race-conditions.
534 * As we don't lock buffers (unless we are reading them, that is),
535 * something might happen to it while we sleep (ie a read-error
536 * will force it bad). This shouldn't really happen currently, but
537 * the code is ready.
538 */
539 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
540 {
541 struct buffer_head **head = &hash(dev, block);
542 struct buffer_head *bh;
543
544 read_lock(&hash_table_lock);
545 for(bh = *head; bh; bh = bh->b_next)
546 if (bh->b_blocknr == block &&
547 bh->b_size == size &&
548 bh->b_dev == dev)
549 break;
550 if (bh)
551 atomic_inc(&bh->b_count);
552 read_unlock(&hash_table_lock);
553
554 return bh;
555 }
556
557 unsigned int get_hardblocksize(kdev_t dev)
558 {
559 /*
560 * Get the hard sector size for the given device. If we don't know
561 * what it is, return 0.
562 */
563 if (hardsect_size[MAJOR(dev)] != NULL) {
564 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
565 if (blksize != 0)
566 return blksize;
567 }
568
569 /*
570 * We don't know what the hardware sector size for this device is.
571 * Return 0 indicating that we don't know.
572 */
573 return 0;
574 }
575
576 /* If invalidate_buffers() trashes dirty buffers, it means some kind
577 of fs corruption is going on. Trashing dirty data always implies losing
578 information that was supposed to be stored on the physical layer
579 by the user.
580
581 Thus invalidate_buffers in general usage is not allowed to trash dirty
582 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
583
584 NOTE: In the case where the user removed a removable-media-disk even if
585 there's still dirty data not synced on disk (due to a bug in the device driver
586 or due to an error of the user), by not destroying the dirty buffers we could
587 generate corruption also on the next media inserted, thus a parameter is
588 necessary to handle this case in the safest way possible (trying
589 to not corrupt also the new disk inserted with the data belonging to
590 the old now corrupted disk). Also for the ramdisk the natural thing
591 to do in order to release the ramdisk memory is to destroy dirty buffers.
592
593 These are two special cases. Normal usage implies that the device driver
594 issues a sync on the device (without waiting for I/O completion) and
595 then an invalidate_buffers call that doesn't trash dirty buffers. */
/*
 * Move the buffers of 'dev' from the hash/LRU queues to the free lists.
 *
 * A buffer is only freed when nobody holds a reference (b_count == 0)
 * and it is either clean or 'destroy_dirty_buffers' is non-zero.
 * Locked buffers are waited for first.
 */
596 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
597 {
598 int i, nlist, slept;
599 struct buffer_head * bh, * bh_next;
600
601 retry:
602 slept = 0;
603 spin_lock(&lru_list_lock);
604 for(nlist = 0; nlist < NR_LIST; nlist++) {
605 bh = lru_list[nlist];
606 if (!bh)
607 continue;
608 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
609 bh_next = bh->b_next_free;
610 if (bh->b_dev != dev)
611 continue;
612 if (buffer_locked(bh)) {
/* Pin the buffer through b_count while lru_list_lock is dropped to sleep. */
613 atomic_inc(&bh->b_count);
614 spin_unlock(&lru_list_lock);
615 wait_on_buffer(bh);
616 slept = 1;
617 spin_lock(&lru_list_lock);
618 atomic_dec(&bh->b_count);
619 }
620
621 write_lock(&hash_table_lock);
622 if (!atomic_read(&bh->b_count) &&
623 (destroy_dirty_buffers || !buffer_dirty(bh))) {
624 __remove_from_queues(bh);
625 put_last_free(bh);
626 }
627 write_unlock(&hash_table_lock);
/* After sleeping, abandon this scan and start over from 'retry';
 * presumably because the lists may have changed while the lock was
 * dropped - confirm.
 */
628 if (slept)
629 goto out;
630 }
631 }
632 out:
633 spin_unlock(&lru_list_lock);
634 if (slept)
635 goto retry;
636 }
637
/*
 * Change the soft block size of 'dev' to 'size' bytes.
 *
 * Does nothing when the major has no blksize_size[] table or the size is
 * already 'size'.  Panics on a size that is not a power of two between
 * 512 and PAGE_SIZE.  Otherwise the device is synced, the new size is
 * recorded, and every buffer of the device that has a different size is
 * thrown out of the queues (or, if still referenced, marked clean and
 * not uptodate, with a warning).
 */
638 void set_blocksize(kdev_t dev, int size)
639 {
640 extern int *blksize_size[];
641 int i, nlist, slept;
642 struct buffer_head * bh, * bh_next;
643
644 if (!blksize_size[MAJOR(dev)])
645 return;
646
647 /* Size must be a power of two, and between 512 and PAGE_SIZE */
648 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
649 panic("Invalid blocksize passed to set_blocksize");
650
/* Size not set yet and the request is BLOCK_SIZE: just record it. */
651 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
652 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
653 return;
654 }
655 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
656 return;
/* Any non-zero 'wait' argument makes sync_buffers() wait for the writes. */
657 sync_buffers(dev, 2);
658 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
659
660 retry:
661 slept = 0;
662 spin_lock(&lru_list_lock);
663 for(nlist = 0; nlist < NR_LIST; nlist++) {
664 bh = lru_list[nlist];
665 if (!bh)
666 continue;
667 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
668 bh_next = bh->b_next_free;
669 if (bh->b_dev != dev || bh->b_size == size)
670 continue;
671 if (buffer_locked(bh)) {
/* Pin the buffer through b_count while lru_list_lock is dropped to sleep. */
672 atomic_inc(&bh->b_count);
673 spin_unlock(&lru_list_lock);
674 wait_on_buffer(bh);
675 slept = 1;
676 spin_lock(&lru_list_lock);
677 atomic_dec(&bh->b_count);
678 }
679
680 write_lock(&hash_table_lock);
681 if (!atomic_read(&bh->b_count)) {
/* Unreferenced: free it, warning if that discards dirty data. */
682 if (buffer_dirty(bh))
683 printk(KERN_WARNING
684 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
685 kdevname(dev), bh->b_blocknr, bh->b_size);
686 __remove_from_queues(bh);
687 put_last_free(bh);
688 } else {
/* Still referenced: it cannot be freed, so mark it clean and stale. */
689 if (atomic_set_buffer_clean(bh))
690 __refile_buffer(bh);
691 clear_bit(BH_Uptodate, &bh->b_state);
692 printk(KERN_WARNING
693 "set_blocksize: "
694 "b_count %d, dev %s, block %lu, from %p\n",
695 atomic_read(&bh->b_count), bdevname(bh->b_dev),
696 bh->b_blocknr, __builtin_return_address(0));
697 }
698 write_unlock(&hash_table_lock);
/* After sleeping, abandon this scan and start over from 'retry'. */
699 if (slept)
700 goto out;
701 }
702 }
703 out:
704 spin_unlock(&lru_list_lock);
705 if (slept)
706 goto retry;
707 }
708
709 /*
710 * We used to try various strange things. Let's not.
711 */
712 static void refill_freelist(int size)
713 {
714 if (!grow_buffers(size)) {
715 wakeup_bdflush(1);
716 current->policy |= SCHED_YIELD;
717 schedule();
718 }
719 }
720
721 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
722 {
723 bh->b_list = BUF_CLEAN;
724 bh->b_end_io = handler;
725 bh->b_dev_id = dev_id;
726 }
727
/*
 * Completion handler installed by getblk(): record whether the IO left
 * the buffer uptodate, then unlock it.
 */
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);

	unlock_buffer(bh);
}
733
/*
 * Completion handler that must never run: it finishes the buffer like
 * the sync handler does and then hits BUG().
 */
static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);

	unlock_buffer(bh);

	BUG();
}
740
/*
 * Completion handler for buffers that carry page IO.
 *
 * bh:       the buffer whose IO has finished
 * uptodate: non-zero when the IO succeeded
 *
 * Records the result on the buffer (and flags the page on error), then
 * unlocks the buffer and drops its count.  If no other buffer on the same
 * page that uses this handler is still locked, the page itself is
 * completed: marked uptodate unless an error was seen, nr_async_pages
 * decremented if PG_decr_after was set, and unlocked.
 */
741 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
742 {
743 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
744 unsigned long flags;
745 struct buffer_head *tmp;
746 struct page *page;
747
748 mark_buffer_uptodate(bh, uptodate);
749
750 /* This is a temporary buffer used for page I/O. */
751 page = bh->b_page;
752
753 if (!uptodate)
754 SetPageError(page);
755
756 /*
757 * Be _very_ careful from here on. Bad things can happen if
758 * two buffer heads end IO at almost the same time and both
759 * decide that the page is now completely done.
760 *
761 * Async buffer_heads are here only as labels for IO, and get
762 * thrown away once the IO for this page is complete. IO is
763 * deemed complete once all buffers have been visited
764 * (b_count==0) and are now unlocked. We must make sure that
765 * only the _last_ buffer that decrements its count is the one
766 * that unlock the page..
767 */
768 spin_lock_irqsave(&page_uptodate_lock, flags);
769 unlock_buffer(bh);
770 atomic_dec(&bh->b_count);
/* Walk the other buffers of the page, linked in a ring via b_this_page. */
771 tmp = bh->b_this_page;
772 while (tmp != bh) {
773 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
774 goto still_busy;
775 tmp = tmp->b_this_page;
776 }
777
778 /* OK, the async IO on this page is complete. */
779 spin_unlock_irqrestore(&page_uptodate_lock, flags);
780
781 /*
782 * if none of the buffers had errors then we can set the
783 * page uptodate:
784 */
785 if (!PageError(page))
786 SetPageUptodate(page);
787
788 /*
789 * Run the hooks that have to be done when a page I/O has completed.
790 */
791 if (test_and_clear_bit(PG_decr_after, &page->flags))
792 atomic_dec(&nr_async_pages);
793
794 UnlockPage(page);
795
796 return;
797
/* Another buffer of this page is still under IO; it will finish the page. */
798 still_busy:
799 spin_unlock_irqrestore(&page_uptodate_lock, flags);
800 return;
801 }
802
803 /*
804 * Ok, this is getblk, and it isn't very clear, again to hinder
805 * race-conditions. Most of the code is seldom used, (ie repeating),
806 * so it should be much more efficient than it looks.
807 *
808 * The algorithm is changed: hopefully better, and an elusive bug removed.
809 *
810 * 14.02.92: changed it to sync dirty buffers a bit: better performance
811 * when the filesystem starts to get full of dirty blocks (I hope).
812 */
813 struct buffer_head * getblk(kdev_t dev, int block, int size)
814 {
815 struct buffer_head * bh;
816 int isize;
817
818 repeat:
819 bh = get_hash_table(dev, block, size);
820 if (bh)
821 goto out;
822
823 isize = BUFSIZE_INDEX(size);
824 spin_lock(&free_list[isize].lock);
825 bh = free_list[isize].list;
826 if (bh) {
827 __remove_from_free_list(bh, isize);
828 atomic_set(&bh->b_count, 1);
829 }
830 spin_unlock(&free_list[isize].lock);
831
832 /*
833 * OK, FINALLY we know that this buffer is the only one of
834 * its kind, we hold a reference (b_count>0), it is unlocked,
835 * and it is clean.
836 */
837 if (bh) {
838 init_buffer(bh, end_buffer_io_sync, NULL);
839 bh->b_dev = dev;
840 bh->b_blocknr = block;
841 bh->b_state = 1 << BH_Mapped;
842
843 /* Insert the buffer into the regular lists */
844 insert_into_queues(bh);
845 out:
846 touch_buffer(bh);
847 return bh;
848 }
849
850 /*
851 * If we block while refilling the free list, somebody may
852 * create the buffer first ... search the hashes again.
853 */
854 refill_freelist(size);
855 goto repeat;
856 }
857
858 /* -1 -> no need to flush
859 0 -> async flush
860 1 -> sync flush (wait for I/O completion) */
861 static int balance_dirty_state(kdev_t dev)
862 {
863 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
864
865 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
866 tot = nr_free_buffer_pages();
867 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
868
869 dirty *= 200;
870 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
871 hard_dirty_limit = soft_dirty_limit * 2;
872
873 if (dirty > soft_dirty_limit) {
874 if (dirty > hard_dirty_limit)
875 return 1;
876 return 0;
877 }
878 return -1;
879 }
880
881 /*
882 * if a new dirty buffer is created we need to balance bdflush.
883 *
884 * in the future we might want to make bdflush aware of different
885 * pressures on different devices - thus the (currently unused)
886 * 'dev' parameter.
887 */
888 void balance_dirty(kdev_t dev)
889 {
890 int state = balance_dirty_state(dev);
891
892 if (state < 0)
893 return;
894 wakeup_bdflush(state);
895 }
896
897 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
898 {
899 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
900 refile_buffer(bh);
901 }
902
903 /* atomic version, the user must call balance_dirty() by hand
904 as soon as it becomes possible to block */
/*
 * Set the dirty bit on 'bh'.  The flush time is only set (and the
 * buffer refiled) when atomic_set_buffer_dirty() returns zero.
 */
void __mark_buffer_dirty(struct buffer_head *bh, int flag)
{
	if (atomic_set_buffer_dirty(bh))
		return;

	__mark_dirty(bh, flag);
}
910
911 void mark_buffer_dirty(struct buffer_head *bh, int flag)
912 {
913 __mark_buffer_dirty(bh, flag);
914 balance_dirty(bh->b_dev);
915 }
916
917 /*
918 * A buffer may need to be moved from one buffer list to another
919 * (e.g. in case it is not shared any more). Handle this.
920 */
921 static void __refile_buffer(struct buffer_head *bh)
922 {
923 int dispose = BUF_CLEAN;
924 if (buffer_locked(bh))
925 dispose = BUF_LOCKED;
926 if (buffer_dirty(bh))
927 dispose = BUF_DIRTY;
928 if (buffer_protected(bh))
929 dispose = BUF_PROTECTED;
930 if (dispose != bh->b_list) {
931 __remove_from_lru_list(bh, bh->b_list);
932 bh->b_list = dispose;
933 __insert_into_lru_list(bh, dispose);
934 }
935 }
936
937 void refile_buffer(struct buffer_head *bh)
938 {
939 spin_lock(&lru_list_lock);
940 __refile_buffer(bh);
941 spin_unlock(&lru_list_lock);
942 }
943
944 /*
945 * Release a buffer head
946 */
947 void __brelse(struct buffer_head * buf)
948 {
949 if (atomic_read(&buf->b_count)) {
950 atomic_dec(&buf->b_count);
951 return;
952 }
953 printk("VFS: brelse: Trying to free free buffer\n");
954 }
955
956 /*
957 * bforget() is like brelse(), except it puts the buffer on the
958 * free list if it can.. We can NOT free the buffer if:
959 * - there are other users of it
960 * - it is locked and thus can have active IO
961 */
962 void __bforget(struct buffer_head * buf)
963 {
964 /* grab the lru lock here to block bdflush. */
965 spin_lock(&lru_list_lock);
966 write_lock(&hash_table_lock);
967 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
968 goto in_use;
969 __hash_unlink(buf);
970 write_unlock(&hash_table_lock);
971 __remove_from_lru_list(buf, buf->b_list);
972 spin_unlock(&lru_list_lock);
973 put_last_free(buf);
974 return;
975
976 in_use:
977 write_unlock(&hash_table_lock);
978 spin_unlock(&lru_list_lock);
979 }
980
981 /*
982 * bread() reads a specified block and returns the buffer that contains
983 * it. It returns NULL if the block was unreadable.
984 */
985 struct buffer_head * bread(kdev_t dev, int block, int size)
986 {
987 struct buffer_head * bh;
988
989 bh = getblk(dev, block, size);
990 if (buffer_uptodate(bh))
991 return bh;
992 ll_rw_block(READ, 1, &bh);
993 wait_on_buffer(bh);
994 if (buffer_uptodate(bh))
995 return bh;
996 brelse(bh);
997 return NULL;
998 }
999
1000 /*
1001 * Ok, breada can be used as bread, but additionally starts reading the
1002 * blocks that follow. How many is derived from the data left between
1003 * 'pos' and 'filesize' and the device's read_ahead value, capped at NBUF.
1004 */
1005
1006 #define NBUF 16
1007
1008 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1009 unsigned int pos, unsigned int filesize)
1010 {
1011 struct buffer_head * bhlist[NBUF];
1012 unsigned int blocks;
1013 struct buffer_head * bh;
1014 int index;
1015 int i, j;
1016
1017 if (pos >= filesize)
1018 return NULL;
1019
1020 if (block < 0)
1021 return NULL;
1022
1023 bh = getblk(dev, block, bufsize);
1024 index = BUFSIZE_INDEX(bh->b_size);
1025
1026 if (buffer_uptodate(bh))
1027 return(bh);
1028 else ll_rw_block(READ, 1, &bh);
1029
1030 blocks = (filesize - pos) >> (9+index);
1031
1032 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1033 blocks = read_ahead[MAJOR(dev)] >> index;
1034 if (blocks > NBUF)
1035 blocks = NBUF;
1036
1037 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1038
1039 bhlist[0] = bh;
1040 j = 1;
1041 for(i=1; i<blocks; i++) {
1042 bh = getblk(dev,block+i,bufsize);
1043 if (buffer_uptodate(bh)) {
1044 brelse(bh);
1045 break;
1046 }
1047 else bhlist[j++] = bh;
1048 }
1049
1050 /* Request the read for these buffers, and then release them. */
1051 if (j>1)
1052 ll_rw_block(READA, (j-1), bhlist+1);
1053 for(i=1; i<j; i++)
1054 brelse(bhlist[i]);
1055
1056 /* Wait for this buffer, and then continue on. */
1057 bh = bhlist[0];
1058 wait_on_buffer(bh);
1059 if (buffer_uptodate(bh))
1060 return bh;
1061 brelse(bh);
1062 return NULL;
1063 }
1064
1065 /*
1066 * Note: the caller should wake up the buffer_wait list if needed.
1067 */
1068 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1069 {
1070 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1071 kmem_cache_free(bh_cachep, bh);
1072 } else {
1073 bh->b_blocknr = -1;
1074 init_waitqueue_head(&bh->b_wait);
1075 nr_unused_buffer_heads++;
1076 bh->b_next_free = unused_list;
1077 bh->b_this_page = NULL;
1078 unused_list = bh;
1079 }
1080 }
1081
1082 /*
1083 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1084 * no-buffer-head deadlock. Return NULL on failure; waiting for
1085 * buffer heads is now handled in create_buffers().
1086 */
1087 static struct buffer_head * get_unused_buffer_head(int async)
1088 {
1089 struct buffer_head * bh;
1090
1091 spin_lock(&unused_list_lock);
1092 if (nr_unused_buffer_heads > NR_RESERVED) {
1093 bh = unused_list;
1094 unused_list = bh->b_next_free;
1095 nr_unused_buffer_heads--;
1096 spin_unlock(&unused_list_lock);
1097 return bh;
1098 }
1099 spin_unlock(&unused_list_lock);
1100
1101 /* This is critical. We can't swap out pages to get
1102 * more buffer heads, because the swap-out may need
1103 * more buffer-heads itself. Thus SLAB_BUFFER.
1104 */
1105 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1106 memset(bh, 0, sizeof(*bh));
1107 init_waitqueue_head(&bh->b_wait);
1108 return bh;
1109 }
1110
1111 /*
1112 * If we need an async buffer, use the reserved buffer heads.
1113 */
1114 if (async) {
1115 spin_lock(&unused_list_lock);
1116 if (unused_list) {
1117 bh = unused_list;
1118 unused_list = bh->b_next_free;
1119 nr_unused_buffer_heads--;
1120 spin_unlock(&unused_list_lock);
1121 return bh;
1122 }
1123 spin_unlock(&unused_list_lock);
1124 }
1125 #if 0
1126 /*
1127 * (Pending further analysis ...)
1128 * Ordinary (non-async) requests can use a different memory priority
1129 * to free up pages. Any swapping thus generated will use async
1130 * buffer heads.
1131 */
1132 if(!async &&
1133 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1134 memset(bh, 0, sizeof(*bh));
1135 init_waitqueue_head(&bh->b_wait);
1136 return bh;
1137 }
1138 #endif
1139
1140 return NULL;
1141 }
1142
1143 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1144 {
1145 bh->b_page = page;
1146 if (offset >= PAGE_SIZE)
1147 BUG();
1148 if (PageHighMem(page))
1149 /*
1150 * This catches illegal uses and preserves the offset:
1151 */
1152 bh->b_data = (char *)(0 + offset);
1153 else
1154 bh->b_data = (char *)(page_address(page) + offset);
1155 }
1156
1157 /*
1158 * Create the appropriate buffers when given a page for data area and
1159 * the size of each buffer.. Use the bh->b_this_page linked list to
1160 * follow the buffers created. Return NULL if unable to create more
1161 * buffers.
1162 * The async flag is used to differentiate async IO (paging, swapping)
1163 * from ordinary buffer allocations, and only async requests are allowed
1164 * to sleep waiting for buffer heads.
1165 */
1166 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1167 {
1168 struct buffer_head *bh, *head;
1169 long offset;
1170
1171 try_again:
1172 head = NULL;
1173 offset = PAGE_SIZE;
1174 while ((offset -= size) >= 0) {
1175 bh = get_unused_buffer_head(async);
1176 if (!bh)
1177 goto no_grow;
1178
1179 bh->b_dev = B_FREE; /* Flag as unused */
1180 bh->b_this_page = head;
1181 head = bh;
1182
1183 bh->b_state = 0;
1184 bh->b_next_free = NULL;
1185 bh->b_pprev = NULL;
1186 atomic_set(&bh->b_count, 0);
1187 bh->b_size = size;
1188
1189 set_bh_page(bh, page, offset);
1190
1191 bh->b_list = BUF_CLEAN;
1192 bh->b_end_io = end_buffer_io_bad;
1193 }
1194 return head;
1195 /*
1196 * In case anything failed, we just free everything we got.
1197 */
1198 no_grow:
1199 if (head) {
1200 spin_lock(&unused_list_lock);
1201 do {
1202 bh = head;
1203 head = head->b_this_page;
1204 __put_unused_buffer_head(bh);
1205 } while (head);
1206 spin_unlock(&unused_list_lock);
1207
1208 /* Wake up any waiters ... */
1209 wake_up(&buffer_wait);
1210 }
1211
1212 /*
1213 * Return failure for non-async IO requests. Async IO requests
1214 * are not allowed to fail, so we have to wait until buffer heads
1215 * become available. But we don't want tasks sleeping with
1216 * partially complete buffers, so all were released above.
1217 */
1218 if (!async)
1219 return NULL;
1220
1221 /* We're _really_ low on memory. Now we just
1222 * wait for old buffer heads to become free due to
1223 * finishing IO. Since this is an async request and
1224 * the reserve list is empty, we're sure there are
1225 * async buffer heads in use.
1226 */
1227 run_task_queue(&tq_disk);
1228
1229 /*
1230 * Set our state for sleeping, then check again for buffer heads.
1231 * This ensures we won't miss a wake_up from an interrupt.
1232 */
1233 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1234 goto try_again;
1235 }
1236
1237 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1238 {
1239 struct buffer_head *head, *bh, *tail;
1240 int block;
1241
1242 if (!PageLocked(page))
1243 BUG();
1244 /*
1245 * Allocate async buffer heads pointing to this page, just for I/O.
1246 * They don't show up in the buffer hash table, but they *are*
1247 * registered in page->buffers.
1248 */
1249 head = create_buffers(page, size, 1);
1250 if (page->buffers)
1251 BUG();
1252 if (!head)
1253 BUG();
1254 tail = head;
1255 for (bh = head; bh; bh = bh->b_this_page) {
1256 block = *(b++);
1257
1258 tail = bh;
1259 init_buffer(bh, end_buffer_io_async, NULL);
1260 bh->b_dev = dev;
1261 bh->b_blocknr = block;
1262
1263 set_bit(BH_Mapped, &bh->b_state);
1264 }
1265 tail->b_this_page = head;
1266 get_page(page);
1267 page->buffers = head;
1268 return 0;
1269 }
1270
1271 static void unmap_buffer(struct buffer_head * bh)
1272 {
1273 if (buffer_mapped(bh)) {
1274 mark_buffer_clean(bh);
1275 wait_on_buffer(bh);
1276 clear_bit(BH_Uptodate, &bh->b_state);
1277 clear_bit(BH_Mapped, &bh->b_state);
1278 clear_bit(BH_Req, &bh->b_state);
1279 clear_bit(BH_New, &bh->b_state);
1280 }
1281 }
1282
1283 /*
1284 * We don't have to release all buffers here, but
1285 * we have to be sure that no dirty buffer is left
1286 * and no IO is going on (no buffer is locked), because
1287 * we have truncated the file and are going to free the
1288 * blocks on-disk..
1289 */
1290 int block_flushpage(struct page *page, unsigned long offset)
1291 {
1292 struct buffer_head *head, *bh, *next;
1293 unsigned int curr_off = 0;
1294
1295 if (!PageLocked(page))
1296 BUG();
1297 if (!page->buffers)
1298 return 1;
1299
1300 head = page->buffers;
1301 bh = head;
1302 do {
1303 unsigned int next_off = curr_off + bh->b_size;
1304 next = bh->b_this_page;
1305
1306 /*
1307 * is this block fully flushed?
1308 */
1309 if (offset <= curr_off)
1310 unmap_buffer(bh);
1311 curr_off = next_off;
1312 bh = next;
1313 } while (bh != head);
1314
1315 /*
1316 * subtle. We release buffer-heads only if this is
1317 * the 'final' flushpage. We have invalidated the get_block
1318 * cached value unconditionally, so real IO is not
1319 * possible anymore.
1320 *
1321 * If the free doesn't work out, the buffers can be
1322 * left around - they just turn into anonymous buffers
1323 * instead.
1324 */
1325 if (!offset) {
1326 if (!try_to_free_buffers(page)) {
1327 atomic_inc(&buffermem_pages);
1328 return 0;
1329 }
1330 }
1331
1332 return 1;
1333 }
1334
1335 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1336 {
1337 struct buffer_head *bh, *head, *tail;
1338
1339 head = create_buffers(page, blocksize, 1);
1340 if (page->buffers)
1341 BUG();
1342
1343 bh = head;
1344 do {
1345 bh->b_dev = inode->i_dev;
1346 bh->b_blocknr = 0;
1347 bh->b_end_io = end_buffer_io_bad;
1348 tail = bh;
1349 bh = bh->b_this_page;
1350 } while (bh);
1351 tail->b_this_page = head;
1352 page->buffers = head;
1353 get_page(page);
1354 }
1355
1356 static void unmap_underlying_metadata(struct buffer_head * bh)
1357 {
1358 struct buffer_head *old_bh;
1359
1360 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1361 if (old_bh) {
1362 unmap_buffer(old_bh);
1363 /* Here we could run brelse or bforget. We use
1364 bforget because it will try to put the buffer
1365 in the freelist. */
1366 __bforget(old_bh);
1367 }
1368 }
1369
1370 /*
1371 * block_write_full_page() is SMP-safe - currently it's still
1372 * being called with the kernel lock held, but the code is ready.
1373 */
1374 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1375 {
1376 int err, i, need_balance_dirty = 0;
1377 unsigned long block;
1378 struct buffer_head *bh, *head;
1379
1380 if (!PageLocked(page))
1381 BUG();
1382
1383 if (!page->buffers)
1384 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1385 head = page->buffers;
1386
1387 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1388
1389 bh = head;
1390 i = 0;
1391 do {
1392 /*
1393 * If the buffer isn't up-to-date, we can't be sure
1394 * that the buffer has been initialized with the proper
1395 * block number information etc..
1396 *
1397 * Leave it to the low-level FS to make all those
1398 * decisions (block #0 may actually be a valid block)
1399 */
1400 bh->b_end_io = end_buffer_io_sync;
1401 if (!buffer_mapped(bh)) {
1402 err = get_block(inode, block, bh, 1);
1403 if (err)
1404 goto out;
1405 if (buffer_new(bh))
1406 unmap_underlying_metadata(bh);
1407 }
1408 set_bit(BH_Uptodate, &bh->b_state);
1409 if (!atomic_set_buffer_dirty(bh)) {
1410 __mark_dirty(bh, 0);
1411 need_balance_dirty = 1;
1412 }
1413
1414 bh = bh->b_this_page;
1415 block++;
1416 } while (bh != head);
1417
1418 if (need_balance_dirty)
1419 balance_dirty(bh->b_dev);
1420
1421 SetPageUptodate(page);
1422 return 0;
1423 out:
1424 ClearPageUptodate(page);
1425 return err;
1426 }
1427
1428 static int __block_prepare_write(struct inode *inode, struct page *page,
1429 unsigned from, unsigned to, get_block_t *get_block)
1430 {
1431 unsigned block_start, block_end;
1432 unsigned long block;
1433 int err = 0;
1434 unsigned blocksize, bbits;
1435 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1436 char *kaddr = (char *)kmap(page);
1437
1438 blocksize = inode->i_sb->s_blocksize;
1439 if (!page->buffers)
1440 create_empty_buffers(page, inode, blocksize);
1441 head = page->buffers;
1442
1443 bbits = inode->i_sb->s_blocksize_bits;
1444 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1445
1446 for(bh = head, block_start = 0; bh != head || !block_start;
1447 block++, block_start=block_end, bh = bh->b_this_page) {
1448 if (!bh)
1449 BUG();
1450 block_end = block_start+blocksize;
1451 if (block_end <= from)
1452 continue;
1453 if (block_start >= to)
1454 break;
1455 bh->b_end_io = end_buffer_io_sync;
1456 if (!buffer_mapped(bh)) {
1457 err = get_block(inode, block, bh, 1);
1458 if (err)
1459 goto out;
1460 if (buffer_new(bh)) {
1461 unmap_underlying_metadata(bh);
1462 if (block_end > to)
1463 memset(kaddr+to, 0, block_end-to);
1464 if (block_start < from)
1465 memset(kaddr+block_start, 0, from-block_start);
1466 continue;
1467 }
1468 }
1469 if (!buffer_uptodate(bh) &&
1470 (block_start < from || block_end > to)) {
1471 ll_rw_block(READ, 1, &bh);
1472 *wait_bh++=bh;
1473 }
1474 }
1475 /*
1476 * If we issued read requests - let them complete.
1477 */
1478 while(wait_bh > wait) {
1479 wait_on_buffer(*--wait_bh);
1480 err = -EIO;
1481 if (!buffer_uptodate(*wait_bh))
1482 goto out;
1483 }
1484 return 0;
1485 out:
1486 return err;
1487 }
1488
1489 static int __block_commit_write(struct inode *inode, struct page *page,
1490 unsigned from, unsigned to)
1491 {
1492 unsigned block_start, block_end;
1493 int partial = 0, need_balance_dirty = 0;
1494 unsigned blocksize;
1495 struct buffer_head *bh, *head;
1496
1497 blocksize = inode->i_sb->s_blocksize;
1498
1499 for(bh = head = page->buffers, block_start = 0;
1500 bh != head || !block_start;
1501 block_start=block_end, bh = bh->b_this_page) {
1502 block_end = block_start + blocksize;
1503 if (block_end <= from || block_start >= to) {
1504 if (!buffer_uptodate(bh))
1505 partial = 1;
1506 } else {
1507 set_bit(BH_Uptodate, &bh->b_state);
1508 if (!atomic_set_buffer_dirty(bh)) {
1509 __mark_dirty(bh, 0);
1510 need_balance_dirty = 1;
1511 }
1512 }
1513 }
1514
1515 if (need_balance_dirty)
1516 balance_dirty(bh->b_dev);
1517 /*
1518 * is this a partial write that happened to make all buffers
1519 * uptodate then we can optimize away a bogus readpage() for
1520 * the next read(). Here we 'discover' wether the page went
1521 * uptodate as a result of this (potentially partial) write.
1522 */
1523 if (!partial)
1524 SetPageUptodate(page);
1525 return 0;
1526 }
1527
1528 /*
1529 * Generic "read page" function for block devices that have the normal
1530 * get_block functionality. This is most of the block device filesystems.
1531 * Reads the page asynchronously --- the unlock_buffer() and
1532 * mark_buffer_uptodate() functions propagate buffer state into the
1533 * page struct once IO has completed.
1534 */
1535 static inline int __block_read_full_page(struct inode *inode, struct page *page,
1536 get_block_t *get_block)
1537 {
1538 unsigned long iblock;
1539 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1540 unsigned int blocksize, blocks;
1541 unsigned long kaddr = 0;
1542 int nr, i;
1543
1544 if (!PageLocked(page))
1545 PAGE_BUG(page);
1546 blocksize = inode->i_sb->s_blocksize;
1547 if (!page->buffers)
1548 create_empty_buffers(page, inode, blocksize);
1549 head = page->buffers;
1550
1551 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1552 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1553 bh = head;
1554 nr = 0;
1555 i = 0;
1556
1557 do {
1558 if (buffer_uptodate(bh))
1559 continue;
1560
1561 if (!buffer_mapped(bh)) {
1562 get_block(inode, iblock, bh, 0);
1563 if (!buffer_mapped(bh)) {
1564 if (!kaddr)
1565 kaddr = kmap(page);
1566 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1567 set_bit(BH_Uptodate, &bh->b_state);
1568 continue;
1569 }
1570 }
1571
1572 init_buffer(bh, end_buffer_io_async, NULL);
1573 atomic_inc(&bh->b_count);
1574 arr[nr] = bh;
1575 nr++;
1576 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1577
1578 ++current->maj_flt;
1579 if (nr) {
1580 if (Page_Uptodate(page))
1581 BUG();
1582 ll_rw_block(READ, nr, arr);
1583 } else {
1584 /*
1585 * all buffers are uptodate - we can set the page
1586 * uptodate as well.
1587 */
1588 SetPageUptodate(page);
1589 UnlockPage(page);
1590 }
1591 if (kaddr)
1592 kunmap(page);
1593 return 0;
1594 }
1595
1596 /*
1597 * For moronic filesystems that do not allow holes in file.
1598 * We may have to extend the file.
1599 */
1600
1601 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1602 {
1603 struct address_space *mapping = page->mapping;
1604 struct inode *inode = (struct inode*)mapping->host;
1605 struct page *new_page;
1606 unsigned long pgpos;
1607 long status;
1608 unsigned zerofrom;
1609 unsigned blocksize = inode->i_sb->s_blocksize;
1610 char *kaddr;
1611
1612 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1613 status = -ENOMEM;
1614 new_page = grab_cache_page(mapping, pgpos);
1615 if (!new_page)
1616 goto out;
1617 /* we might sleep */
1618 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1619 UnlockPage(new_page);
1620 page_cache_release(new_page);
1621 continue;
1622 }
1623 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1624 if (zerofrom & (blocksize-1)) {
1625 *bytes |= (blocksize-1);
1626 (*bytes)++;
1627 }
1628 status = __block_prepare_write(inode, new_page, zerofrom,
1629 PAGE_CACHE_SIZE, get_block);
1630 if (status)
1631 goto out_unmap;
1632 kaddr = (char*)page_address(page);
1633 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1634 __block_commit_write(inode, new_page, zerofrom, to);
1635 kunmap(new_page);
1636 UnlockPage(new_page);
1637 page_cache_release(new_page);
1638 }
1639
1640 if (page->index < pgpos) {
1641 /* completely inside the area */
1642 zerofrom = offset;
1643 } else {
1644 /* page covers the boundary, find the boundary offset */
1645 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1646
1647 /* if we will expand the thing last block will be filled */
1648 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1649 *bytes |= (blocksize-1);
1650 (*bytes)++;
1651 }
1652
1653 /* starting below the boundary? Nothing to zero out */
1654 if (offset <= zerofrom)
1655 zerofrom = offset;
1656 }
1657 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1658 if (status)
1659 goto out1;
1660 kaddr = (char*)page_address(page);
1661 if (zerofrom < offset) {
1662 memset(kaddr+zerofrom, 0, offset-zerofrom);
1663 __block_commit_write(inode, page, zerofrom, offset);
1664 }
1665 return 0;
1666 out1:
1667 ClearPageUptodate(page);
1668 kunmap(page);
1669 return status;
1670
1671 out_unmap:
1672 ClearPageUptodate(new_page);
1673 kunmap(new_page);
1674 UnlockPage(new_page);
1675 page_cache_release(new_page);
1676 out:
1677 return status;
1678 }
1679
1680 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1681 get_block_t *get_block)
1682 {
1683 struct inode *inode = (struct inode*)page->mapping->host;
1684 int err = __block_prepare_write(inode, page, from, to, get_block);
1685 if (err) {
1686 ClearPageUptodate(page);
1687 kunmap(page);
1688 }
1689 return err;
1690 }
1691
1692 int generic_commit_write(struct file *file, struct page *page,
1693 unsigned from, unsigned to)
1694 {
1695 __block_commit_write((struct inode*)page->mapping->host,page,from,to);
1696 kunmap(page);
1697 return 0;
1698 }
1699
1700 int block_write_full_page(struct page *page, get_block_t *get_block)
1701 {
1702 struct inode *inode = (struct inode*)page->mapping->host;
1703 return __block_write_full_page(inode, page, get_block);
1704 }
1705
1706 int block_read_full_page(struct page *page, get_block_t *get_block)
1707 {
1708 struct inode *inode = (struct inode*)page->mapping->host;
1709 return __block_read_full_page(inode, page, get_block);
1710 }
1711
1712 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1713 {
1714 struct buffer_head tmp;
1715 struct inode *inode = (struct inode*)mapping->host;
1716 tmp.b_state = 0;
1717 tmp.b_blocknr = 0;
1718 get_block(inode, block, &tmp, 0);
1719 return tmp.b_blocknr;
1720 }
1721
1722 /*
1723 * IO completion routine for a buffer_head being used for kiobuf IO: we
1724 * can't dispatch the kiobuf callback until io_count reaches 0.
1725 */
1726
1727 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1728 {
1729 struct kiobuf *kiobuf;
1730
1731 mark_buffer_uptodate(bh, uptodate);
1732
1733 kiobuf = bh->b_kiobuf;
1734 if (atomic_dec_and_test(&kiobuf->io_count))
1735 kiobuf->end_io(kiobuf);
1736 if (!uptodate)
1737 kiobuf->errno = -EIO;
1738 }
1739
1740
1741 /*
1742 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1743 * for them to complete. Clean up the buffer_heads afterwards.
1744 */
1745
1746 static int do_kio(struct kiobuf *kiobuf,
1747 int rw, int nr, struct buffer_head *bh[], int size)
1748 {
1749 int iosize;
1750 int i;
1751 struct buffer_head *tmp;
1752
1753 struct task_struct *tsk = current;
1754 DECLARE_WAITQUEUE(wait, tsk);
1755
1756 if (rw == WRITE)
1757 rw = WRITERAW;
1758 atomic_add(nr, &kiobuf->io_count);
1759 kiobuf->errno = 0;
1760 ll_rw_block(rw, nr, bh);
1761
1762 kiobuf_wait_for_io(kiobuf);
1763
1764 spin_lock(&unused_list_lock);
1765
1766 iosize = 0;
1767 for (i = nr; --i >= 0; ) {
1768 iosize += size;
1769 tmp = bh[i];
1770 if (!buffer_uptodate(tmp)) {
1771 /* We are traversing bh'es in reverse order so
1772 clearing iosize on error calculates the
1773 amount of IO before the first error. */
1774 iosize = 0;
1775 }
1776 __put_unused_buffer_head(tmp);
1777 }
1778
1779 spin_unlock(&unused_list_lock);
1780
1781 if (iosize)
1782 return iosize;
1783 if (kiobuf->errno)
1784 return kiobuf->errno;
1785 return -EIO;
1786 }
1787
1788 /*
1789 * Start I/O on a physical range of kernel memory, defined by a vector
1790 * of kiobuf structs (much like a user-space iovec list).
1791 *
1792 * The kiobuf must already be locked for IO. IO is submitted
1793 * asynchronously: you need to check page->locked, page->uptodate, and
1794 * maybe wait on page->wait.
1795 *
1796 * It is up to the caller to make sure that there are enough blocks
1797 * passed in to completely map the iobufs to disk.
1798 */
1799
1800 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1801 kdev_t dev, unsigned long b[], int size)
1802 {
1803 int err;
1804 int length;
1805 int transferred;
1806 int i;
1807 int bufind;
1808 int pageind;
1809 int bhind;
1810 int offset;
1811 unsigned long blocknr;
1812 struct kiobuf * iobuf = NULL;
1813 struct page * map;
1814 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1815
1816 if (!nr)
1817 return 0;
1818
1819 /*
1820 * First, do some alignment and validity checks
1821 */
1822 for (i = 0; i < nr; i++) {
1823 iobuf = iovec[i];
1824 if ((iobuf->offset & (size-1)) ||
1825 (iobuf->length & (size-1)))
1826 return -EINVAL;
1827 if (!iobuf->locked)
1828 panic("brw_kiovec: iobuf not locked for I/O");
1829 if (!iobuf->nr_pages)
1830 panic("brw_kiovec: iobuf not initialised");
1831 }
1832
1833 /*
1834 * OK to walk down the iovec doing page IO on each page we find.
1835 */
1836 bufind = bhind = transferred = err = 0;
1837 for (i = 0; i < nr; i++) {
1838 iobuf = iovec[i];
1839 offset = iobuf->offset;
1840 length = iobuf->length;
1841
1842 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1843 map = iobuf->maplist[pageind];
1844
1845 while (length > 0) {
1846 blocknr = b[bufind++];
1847 tmp = get_unused_buffer_head(0);
1848 if (!tmp) {
1849 err = -ENOMEM;
1850 goto error;
1851 }
1852
1853 tmp->b_dev = B_FREE;
1854 tmp->b_size = size;
1855 set_bh_page(tmp, map, offset);
1856 tmp->b_this_page = tmp;
1857
1858 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1859 tmp->b_dev = dev;
1860 tmp->b_blocknr = blocknr;
1861 tmp->b_state = 1 << BH_Mapped;
1862 tmp->b_kiobuf = iobuf;
1863
1864 if (rw == WRITE) {
1865 set_bit(BH_Uptodate, &tmp->b_state);
1866 set_bit(BH_Dirty, &tmp->b_state);
1867 }
1868
1869 bh[bhind++] = tmp;
1870 length -= size;
1871 offset += size;
1872
1873 /*
1874 * Start the IO if we have got too much
1875 */
1876 if (bhind >= KIO_MAX_SECTORS) {
1877 err = do_kio(iobuf, rw, bhind, bh, size);
1878 if (err >= 0)
1879 transferred += err;
1880 else
1881 goto finished;
1882 bhind = 0;
1883 }
1884
1885 if (offset >= PAGE_SIZE) {
1886 offset = 0;
1887 break;
1888 }
1889 } /* End of block loop */
1890 } /* End of page loop */
1891 } /* End of iovec loop */
1892
1893 /* Is there any IO still left to submit? */
1894 if (bhind) {
1895 err = do_kio(iobuf, rw, bhind, bh, size);
1896 if (err >= 0)
1897 transferred += err;
1898 else
1899 goto finished;
1900 }
1901
1902 finished:
1903 if (transferred)
1904 return transferred;
1905 return err;
1906
1907 error:
1908 /* We got an error allocating the bh'es. Just free the current
1909 buffer_heads and exit. */
1910 spin_lock(&unused_list_lock);
1911 for (i = bhind; --i >= 0; ) {
1912 __put_unused_buffer_head(bh[bhind]);
1913 }
1914 spin_unlock(&unused_list_lock);
1915 goto finished;
1916 }
1917
1918 /*
1919 * Start I/O on a page.
1920 * This function expects the page to be locked and may return
1921 * before I/O is complete. You then have to check page->locked,
1922 * page->uptodate, and maybe wait on page->wait.
1923 *
1924 * brw_page() is SMP-safe, although it's being called with the
1925 * kernel lock held - but the code is ready.
1926 *
1927 * FIXME: we need a swapper_inode->get_block function to remove
1928 * some of the bmap kludges and interface ugliness here.
1929 */
1930 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1931 {
1932 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1933 int nr, fresh /* temporary debugging flag */, block;
1934
1935 if (!PageLocked(page))
1936 panic("brw_page: page not locked for I/O");
1937 // clear_bit(PG_error, &page->flags);
1938 /*
1939 * We pretty much rely on the page lock for this, because
1940 * create_page_buffers() might sleep.
1941 */
1942 fresh = 0;
1943 if (!page->buffers) {
1944 create_page_buffers(rw, page, dev, b, size);
1945 fresh = 1;
1946 }
1947 if (!page->buffers)
1948 BUG();
1949
1950 head = page->buffers;
1951 bh = head;
1952 nr = 0;
1953 do {
1954 block = *(b++);
1955
1956 if (fresh && (atomic_read(&bh->b_count) != 0))
1957 BUG();
1958 if (rw == READ) {
1959 if (!fresh)
1960 BUG();
1961 if (!buffer_uptodate(bh)) {
1962 arr[nr++] = bh;
1963 atomic_inc(&bh->b_count);
1964 }
1965 } else { /* WRITE */
1966 if (!bh->b_blocknr) {
1967 if (!block)
1968 BUG();
1969 bh->b_blocknr = block;
1970 } else {
1971 if (!block)
1972 BUG();
1973 }
1974 set_bit(BH_Uptodate, &bh->b_state);
1975 set_bit(BH_Dirty, &bh->b_state);
1976 arr[nr++] = bh;
1977 atomic_inc(&bh->b_count);
1978 }
1979 bh = bh->b_this_page;
1980 } while (bh != head);
1981 if (rw == READ)
1982 ++current->maj_flt;
1983 if ((rw == READ) && nr) {
1984 if (Page_Uptodate(page))
1985 BUG();
1986 ll_rw_block(rw, nr, arr);
1987 } else {
1988 if (!nr && rw == READ) {
1989 SetPageUptodate(page);
1990 UnlockPage(page);
1991 }
1992 if (nr && (rw == WRITE))
1993 ll_rw_block(rw, nr, arr);
1994 }
1995 return 0;
1996 }
1997
1998 int block_symlink(struct inode *inode, const char *symname, int len)
1999 {
2000 struct address_space *mapping = inode->i_mapping;
2001 struct page *page = grab_cache_page(mapping, 0);
2002 int err = -ENOMEM;
2003 char *kaddr;
2004
2005 if (!page)
2006 goto fail;
2007 err = mapping->a_ops->prepare_write(page, 0, len-1);
2008 if (err)
2009 goto fail_map;
2010 kaddr = (char*)page_address(page);
2011 memcpy(kaddr, symname, len-1);
2012 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2013 inode->i_size = len-1;
2014 /*
2015 * Notice that we are _not_ going to block here - end of page is
2016 * unmapped, so this will only try to map the rest of page, see
2017 * that it is unmapped (typically even will not look into inode -
2018 * ->i_size will be enough for everything) and zero it out.
2019 * OTOH it's obviously correct and should make the page up-to-date.
2020 */
2021 err = mapping->a_ops->readpage(NULL, page);
2022 wait_on_page(page);
2023 page_cache_release(page);
2024 if (err < 0)
2025 goto fail;
2026 mark_inode_dirty(inode);
2027 return 0;
2028 fail_map:
2029 inode->i_size = len-1;
2030 UnlockPage(page);
2031 page_cache_release(page);
2032 fail:
2033 return err;
2034 }
2035
2036 /*
2037 * Try to increase the number of buffers available: the size argument
2038 * is used to determine what kind of buffers we want.
2039 */
2040 static int grow_buffers(int size)
2041 {
2042 struct page * page;
2043 struct buffer_head *bh, *tmp;
2044 struct buffer_head * insert_point;
2045 int isize;
2046
2047 if ((size & 511) || (size > PAGE_SIZE)) {
2048 printk("VFS: grow_buffers: size = %d\n",size);
2049 return 0;
2050 }
2051
2052 page = alloc_page(GFP_BUFFER);
2053 if (!page)
2054 goto out;
2055 bh = create_buffers(page, size, 0);
2056 if (!bh)
2057 goto no_buffer_head;
2058
2059 isize = BUFSIZE_INDEX(size);
2060
2061 spin_lock(&free_list[isize].lock);
2062 insert_point = free_list[isize].list;
2063 tmp = bh;
2064 while (1) {
2065 if (insert_point) {
2066 tmp->b_next_free = insert_point->b_next_free;
2067 tmp->b_prev_free = insert_point;
2068 insert_point->b_next_free->b_prev_free = tmp;
2069 insert_point->b_next_free = tmp;
2070 } else {
2071 tmp->b_prev_free = tmp;
2072 tmp->b_next_free = tmp;
2073 }
2074 insert_point = tmp;
2075 if (tmp->b_this_page)
2076 tmp = tmp->b_this_page;
2077 else
2078 break;
2079 }
2080 tmp->b_this_page = bh;
2081 free_list[isize].list = bh;
2082 spin_unlock(&free_list[isize].lock);
2083
2084 page->buffers = bh;
2085 lru_cache_add(page);
2086 atomic_inc(&buffermem_pages);
2087 return 1;
2088
2089 no_buffer_head:
2090 __free_page(page);
2091 out:
2092 return 0;
2093 }
2094
2095 /*
2096 * Can the buffer be thrown out?
2097 */
2098 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2099 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2100
2101 /*
2102 * try_to_free_buffers() checks if all the buffers on this particular page
2103 * are unused, and free's the page if so.
2104 *
2105 * Wake up bdflush() if this fails - if we're running low on memory due
2106 * to dirty buffers, we need to flush them out as quickly as possible.
2107 *
2108 * NOTE: There are quite a number of ways that threads of control can
2109 * obtain a reference to a buffer head within a page. So we must
2110 * lock out all of these paths to cleanly toss the page.
2111 */
2112 int try_to_free_buffers(struct page * page)
2113 {
2114 struct buffer_head * tmp, * p, * bh = page->buffers;
2115 int index = BUFSIZE_INDEX(bh->b_size);
2116 int ret;
2117
2118 spin_lock(&lru_list_lock);
2119 write_lock(&hash_table_lock);
2120 spin_lock(&free_list[index].lock);
2121 tmp = bh;
2122 do {
2123 p = tmp;
2124
2125 tmp = tmp->b_this_page;
2126 if (buffer_busy(p))
2127 goto busy_buffer_page;
2128 } while (tmp != bh);
2129
2130 spin_lock(&unused_list_lock);
2131 tmp = bh;
2132 do {
2133 struct buffer_head * p = tmp;
2134 tmp = tmp->b_this_page;
2135
2136 /* The buffer can be either on the regular
2137 * queues or on the free list..
2138 */
2139 if (p->b_dev != B_FREE)
2140 __remove_from_queues(p);
2141 else
2142 __remove_from_free_list(p, index);
2143 __put_unused_buffer_head(p);
2144 } while (tmp != bh);
2145 spin_unlock(&unused_list_lock);
2146
2147 /* Wake up anyone waiting for buffer heads */
2148 wake_up(&buffer_wait);
2149
2150 /* And free the page */
2151 page->buffers = NULL;
2152 __free_page(page);
2153 ret = 1;
2154 out:
2155 spin_unlock(&free_list[index].lock);
2156 write_unlock(&hash_table_lock);
2157 spin_unlock(&lru_list_lock);
2158 return ret;
2159
2160 busy_buffer_page:
2161 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2162 if (buffer_dirty(p))
2163 wakeup_bdflush(0);
2164 ret = 0;
2165 goto out;
2166 }
2167
2168 /* ================== Debugging =================== */
2169
2170 void show_buffers(void)
2171 {
2172 #ifdef __SMP__
2173 struct buffer_head * bh;
2174 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2175 int protected = 0;
2176 int nlist;
2177 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2178 #endif
2179
2180 printk("Buffer memory: %6dkB\n",
2181 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2182
2183 #ifdef __SMP__ /* trylock does nothing on UP and so we could deadlock */
2184 if (!spin_trylock(&lru_list_lock))
2185 return;
2186 for(nlist = 0; nlist < NR_LIST; nlist++) {
2187 found = locked = dirty = used = lastused = protected = 0;
2188 bh = lru_list[nlist];
2189 if(!bh) continue;
2190
2191 do {
2192 found++;
2193 if (buffer_locked(bh))
2194 locked++;
2195 if (buffer_protected(bh))
2196 protected++;
2197 if (buffer_dirty(bh))
2198 dirty++;
2199 if (atomic_read(&bh->b_count))
2200 used++, lastused = found;
2201 bh = bh->b_next_free;
2202 } while (bh != lru_list[nlist]);
2203 {
2204 int tmp = nr_buffers_type[nlist];
2205 if (found != tmp)
2206 printk("%9s: BUG -> found %d, reported %d\n",
2207 buf_types[nlist], found, tmp);
2208 }
2209 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2210 "%d locked, %d protected, %d dirty\n",
2211 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2212 used, lastused, locked, protected, dirty);
2213 }
2214 spin_unlock(&lru_list_lock);
2215 #endif
2216 }
2217
2218 /* ===================== Init ======================= */
2219
2220 /*
2221 * allocate the hash table and init the free list
2222 * Use gfp() for the hash table to decrease TLB misses, use
2223 * SLAB cache for buffer heads.
2224 */
2225 void __init buffer_init(unsigned long mempages)
2226 {
2227 int order, i;
2228 unsigned int nr_hash;
2229
2230 /* The buffer cache hash table is less important these days,
2231 * trim it a bit.
2232 */
2233 mempages >>= 14;
2234
2235 mempages *= sizeof(struct buffer_head *);
2236
2237 for (order = 0; (1 << order) < mempages; order++)
2238 ;
2239
2240 /* try to allocate something until we get it or we're asking
2241 for something that is really too small */
2242
2243 do {
2244 unsigned long tmp;
2245
2246 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2247 bh_hash_mask = (nr_hash - 1);
2248
2249 tmp = nr_hash;
2250 bh_hash_shift = 0;
2251 while((tmp >>= 1UL) != 0UL)
2252 bh_hash_shift++;
2253
2254 hash_table = (struct buffer_head **)
2255 __get_free_pages(GFP_ATOMIC, order);
2256 } while (hash_table == NULL && --order > 0);
2257 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2258 nr_hash, order, (1UL<<order) * PAGE_SIZE);
2259
2260 if (!hash_table)
2261 panic("Failed to allocate buffer hash table\n");
2262
2263 /* Setup hash chains. */
2264 for(i = 0; i < nr_hash; i++)
2265 hash_table[i] = NULL;
2266
2267 /* Setup free lists. */
2268 for(i = 0; i < NR_SIZES; i++) {
2269 free_list[i].list = NULL;
2270 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2271 }
2272
2273 /* Setup lru lists. */
2274 for(i = 0; i < NR_LIST; i++)
2275 lru_list[i] = NULL;
2276
2277 bh_cachep = kmem_cache_create("buffer_head",
2278 sizeof(struct buffer_head),
2279 0,
2280 SLAB_HWCACHE_ALIGN, NULL, NULL);
2281 if(!bh_cachep)
2282 panic("Cannot create buffer head SLAB cache\n");
2283 }
2284
2285
2286 /* ====================== bdflush support =================== */
2287
2288 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2289 * response to dirty buffers. Once this process is activated, we write back
2290 * a limited number of buffers to the disks and then go back to sleep again.
2291 */
2292 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2293 struct task_struct *bdflush_tsk = 0;
2294
2295 void wakeup_bdflush(int block)
2296 {
2297 DECLARE_WAITQUEUE(wait, current);
2298
2299 if (current == bdflush_tsk)
2300 return;
2301
2302 if (!block) {
2303 if(bdflush_tsk != NULL) wake_up_process(bdflush_tsk);
2304 return;
2305 }
2306
2307 /* kflushd can wakeup us before we have a chance to
2308 go to sleep so we must be smart in handling
2309 this wakeup event from kflushd to avoid deadlocking in SMP
2310 (we are not holding any lock anymore in these two paths). */
2311 __set_current_state(TASK_UNINTERRUPTIBLE);
2312 add_wait_queue(&bdflush_done, &wait);
2313
2314 wake_up_process(bdflush_tsk);
2315 schedule();
2316
2317 remove_wait_queue(&bdflush_done, &wait);
2318 __set_current_state(TASK_RUNNING);
2319 }
2320
2321 /* This is the _only_ function that deals with flushing async writes
2322 to disk.
2323 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2324 as all dirty buffers lives _only_ in the DIRTY lru list.
2325 As we never browse the LOCKED and CLEAN lru lists they are infact
2326 completly useless. */
2327 static int flush_dirty_buffers(int check_flushtime)
2328 {
2329 struct buffer_head * bh, *next;
2330 int flushed = 0, i;
2331
2332 restart:
2333 spin_lock(&lru_list_lock);
2334 bh = lru_list[BUF_DIRTY];
2335 if (!bh)
2336 goto out_unlock;
2337 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2338 next = bh->b_next_free;
2339
2340 if (!buffer_dirty(bh)) {
2341 __refile_buffer(bh);
2342 continue;
2343 }
2344 if (buffer_locked(bh))
2345 continue;
2346
2347 if (check_flushtime) {
2348 /* The dirty lru list is chronologically ordered so
2349 if the current bh is not yet timed out,
2350 then also all the following bhs
2351 will be too young. */
2352 if (time_before(jiffies, bh->b_flushtime))
2353 goto out_unlock;
2354 } else {
2355 if (++flushed > bdf_prm.b_un.ndirty)
2356 goto out_unlock;
2357 }
2358
2359 /* OK, now we are committed to write it out. */
2360 atomic_inc(&bh->b_count);
2361 spin_unlock(&lru_list_lock);
2362 ll_rw_block(WRITE, 1, &bh);
2363 atomic_dec(&bh->b_count);
2364
2365 if (current->need_resched)
2366 schedule();
2367 goto restart;
2368 }
2369 out_unlock:
2370 spin_unlock(&lru_list_lock);
2371
2372 return flushed;
2373 }
2374
2375 /*
2376 * Here we attempt to write back old buffers. We also try to flush inodes
2377 * and supers as well, since this function is essentially "update", and
2378 * otherwise there would be no way of ensuring that these quantities ever
2379 * get written back. Ideally, we would have a timestamp on the inodes
2380 * and superblocks so that we could write back only the old ones as well
2381 */
2382
2383 static int sync_old_buffers(void)
2384 {
2385 lock_kernel();
2386 sync_supers(0);
2387 sync_inodes(0);
2388 unlock_kernel();
2389
2390 flush_dirty_buffers(1);
2391 /* must really sync all the active I/O request to disk here */
2392 run_task_queue(&tq_disk);
2393 return 0;
2394 }
2395
2396 /* This is the interface to bdflush. As we get more sophisticated, we can
2397 * pass tuning parameters to this "process", to adjust how it behaves.
2398 * We would want to verify each parameter, however, to make sure that it
2399 * is reasonable. */
2400
2401 asmlinkage long sys_bdflush(int func, long data)
2402 {
2403 if (!capable(CAP_SYS_ADMIN))
2404 return -EPERM;
2405
2406 if (func == 1) {
2407 /* do_exit directly and let kupdate to do its work alone. */
2408 do_exit(0);
2409 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2410 a syscall that doesn't care about the current mm context. */
2411 int error;
2412 struct mm_struct *user_mm;
2413
2414 /*
2415 * bdflush will spend all of it's time in kernel-space,
2416 * without touching user-space, so we can switch it into
2417 * 'lazy TLB mode' to reduce the cost of context-switches
2418 * to and from bdflush.
2419 */
2420 user_mm = start_lazy_tlb();
2421 error = sync_old_buffers();
2422 end_lazy_tlb(user_mm);
2423 return error;
2424 #endif
2425 }
2426
2427 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2428 if (func >= 2) {
2429 int i = (func-2) >> 1;
2430 if (i >= 0 && i < N_PARAM) {
2431 if ((func & 1) == 0)
2432 return put_user(bdf_prm.data[i], (int*)data);
2433
2434 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2435 bdf_prm.data[i] = data;
2436 return 0;
2437 }
2438 }
2439 return -EINVAL;
2440 }
2441
2442 /* Having func 0 used to launch the actual bdflush and then never
2443 * return (unless explicitly killed). We return zero here to
2444 * remain semi-compatible with present update(8) programs.
2445 */
2446 return 0;
2447 }
2448
2449 /*
2450 * This is the actual bdflush daemon itself. It used to be started from
2451 * the syscall above, but now we launch it ourselves internally with
2452 * kernel_thread(...) directly after the first thread in init/main.c
2453 */
2454 int bdflush(void * unused)
2455 {
2456 int flushed;
2457 /*
2458 * We have a bare-bones task_struct, and really should fill
2459 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2460 * display semi-sane things. Not real crucial though...
2461 */
2462
2463 current->session = 1;
2464 current->pgrp = 1;
2465 sprintf(current->comm, "kflushd");
2466 bdflush_tsk = current;
2467
2468 /* avoid getting signals */
2469 spin_lock_irq(¤t->sigmask_lock);
2470 flush_signals(current);
2471 sigfillset(¤t->blocked);
2472 recalc_sigpending(current);
2473 spin_unlock_irq(¤t->sigmask_lock);
2474
2475 for (;;) {
2476 CHECK_EMERGENCY_SYNC
2477
2478 flushed = flush_dirty_buffers(0);
2479
2480 /* If wakeup_bdflush will wakeup us
2481 after our bdflush_done wakeup, then
2482 we must make sure to not sleep
2483 in schedule_timeout otherwise
2484 wakeup_bdflush may wait for our
2485 bdflush_done wakeup that would never arrive
2486 (as we would be sleeping) and so it would
2487 deadlock in SMP. */
2488 __set_current_state(TASK_INTERRUPTIBLE);
2489 wake_up(&bdflush_done);
2490 /*
2491 * If there are still a lot of dirty buffers around,
2492 * skip the sleep and flush some more. Otherwise, we
2493 * go to sleep waiting a wakeup.
2494 */
2495 if (!flushed || balance_dirty_state(NODEV) < 0)
2496 schedule();
2497 /* Remember to mark us as running otherwise
2498 the next schedule will block. */
2499 __set_current_state(TASK_RUNNING);
2500 }
2501 }
2502
2503 /*
2504 * This is the kernel update daemon. It was used to live in userspace
2505 * but since it's need to run safely we want it unkillable by mistake.
2506 * You don't need to change your userspace configuration since
2507 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2508 */
2509 int kupdate(void * unused)
2510 {
2511 struct task_struct * tsk = current;
2512 int interval;
2513
2514 tsk->session = 1;
2515 tsk->pgrp = 1;
2516 strcpy(tsk->comm, "kupdate");
2517
2518 /* sigstop and sigcont will stop and wakeup kupdate */
2519 spin_lock_irq(&tsk->sigmask_lock);
2520 sigfillset(&tsk->blocked);
2521 siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2522 recalc_sigpending(tsk);
2523 spin_unlock_irq(&tsk->sigmask_lock);
2524
2525 for (;;) {
2526 /* update interval */
2527 interval = bdf_prm.b_un.interval;
2528 if (interval) {
2529 tsk->state = TASK_INTERRUPTIBLE;
2530 schedule_timeout(interval);
2531 } else {
2532 stop_kupdate:
2533 tsk->state = TASK_STOPPED;
2534 schedule(); /* wait for SIGCONT */
2535 }
2536 /* check for sigstop */
2537 if (signal_pending(tsk)) {
2538 int stopped = 0;
2539 spin_lock_irq(&tsk->sigmask_lock);
2540 if (sigismember(&tsk->signal, SIGSTOP)) {
2541 sigdelset(&tsk->signal, SIGSTOP);
2542 stopped = 1;
2543 }
2544 recalc_sigpending(tsk);
2545 spin_unlock_irq(&tsk->sigmask_lock);
2546 if (stopped)
2547 goto stop_kupdate;
2548 }
2549 #ifdef DEBUG
2550 printk("kupdate() activated...\n");
2551 #endif
2552 sync_old_buffers();
2553 }
2554 }
2555
2556 static int __init bdflush_init(void)
2557 {
2558 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2559 kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2560 return 0;
2561 }
2562
2563 module_init(bdflush_init)
2564
2565
/*
 * This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information.
 */