~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/fs/buffer.c

Version: ~ [ 0.6-2.3.46 ] ~
Architecture: ~ [ um ] ~

** Warning: Cannot open xref database.

1 /* 2 * linux/fs/buffer.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 /* 8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have 9 * been avoided by NEVER letting an interrupt change a buffer (except for the 10 * data, of course), but instead letting the caller do it. 11 */ 12 13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ 14 15 /* Removed a lot of unnecessary code and simplified things now that 16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 17 */ 18 19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating 20 * hash table, use SLAB cache for buffer heads. -DaveM 21 */ 22 23 /* Added 32k buffer block sizes - these are required older ARM systems. 24 * - RMK 25 */ 26 27 /* Thread it... -DaveM */ 28 29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */ 30 31 #include <linux/sched.h> 32 #include <linux/fs.h> 33 #include <linux/malloc.h> 34 #include <linux/locks.h> 35 #include <linux/errno.h> 36 #include <linux/swap.h> 37 #include <linux/smp_lock.h> 38 #include <linux/vmalloc.h> 39 #include <linux/blkdev.h> 40 #include <linux/sysrq.h> 41 #include <linux/file.h> 42 #include <linux/init.h> 43 #include <linux/quotaops.h> 44 #include <linux/iobuf.h> 45 #include <linux/highmem.h> 46 47 #include <asm/uaccess.h> 48 #include <asm/io.h> 49 #include <asm/bitops.h> 50 #include <asm/mmu_context.h> 51 52 #define NR_SIZES 7 53 static char buffersize_index[65] = 54 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 55 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 56 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 57 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 58 6}; 59 60 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) 61 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) 62 #define NR_RESERVED (2*MAX_BUF_PER_PAGE) 63 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 64 number of unused buffer heads */ 65 66 /* Anti-deadlock ordering: 67 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock 68 */ 69 70 /* 71 * Hash table gook.. 72 */ 73 static unsigned int bh_hash_mask = 0; 74 static unsigned int bh_hash_shift = 0; 75 static struct buffer_head **hash_table; 76 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; 77 78 static struct buffer_head *lru_list[NR_LIST]; 79 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; 80 static int nr_buffers_type[NR_LIST] = {0,}; 81 static unsigned long size_buffers_type[NR_LIST] = {0,}; 82 83 static struct buffer_head * unused_list = NULL; 84 static int nr_unused_buffer_heads = 0; 85 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; 86 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); 87 88 struct bh_free_head { 89 struct buffer_head *list; 90 spinlock_t lock; 91 }; 92 static struct bh_free_head free_list[NR_SIZES]; 93 94 kmem_cache_t *bh_cachep; 95 96 static int grow_buffers(int size); 97 static void __refile_buffer(struct buffer_head *); 98 99 /* This is used by some architectures to estimate available memory. */ 100 atomic_t buffermem_pages = ATOMIC_INIT(0); 101 102 /* Here is the parameter block for the bdflush process. If you add or 103 * remove any of the parameters, make sure to update kernel/sysctl.c. 104 */ 105 106 #define N_PARAM 9 107 108 /* The dummy values in this structure are left in there for compatibility 109 * with old programs that play with the /proc entries. 
110 */ 111 union bdflush_param { 112 struct { 113 int nfract; /* Percentage of buffer cache dirty to 114 activate bdflush */ 115 int ndirty; /* Maximum number of dirty blocks to write out per 116 wake-cycle */ 117 int nrefill; /* Number of clean buffers to try to obtain 118 each time we call refill */ 119 int nref_dirt; /* Dirty buffer threshold for activating bdflush 120 when trying to refill buffers. */ 121 int interval; /* jiffies delay between kupdate flushes */ 122 int age_buffer; /* Time for normal buffer to age before we flush it */ 123 int age_super; /* Time for superblock to age before we flush it */ 124 int dummy2; /* unused */ 125 int dummy3; /* unused */ 126 } b_un; 127 unsigned int data[N_PARAM]; 128 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}}; 129 130 /* These are the min and max parameter values that we will allow to be assigned */ 131 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1}; 132 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5}; 133 134 /* 135 * Rewrote the wait-routines to use the "new" wait-queue functionality, 136 * and getting rid of the cli-sti pairs. The wait-queue routines still 137 * need cli-sti, but now it's just a couple of 386 instructions or so. 138 * 139 * Note that the real wait_on_buffer() is an inline function that checks 140 * if 'b_wait' is set before calling this, so that the queues aren't set 141 * up unnecessarily. 142 */ 143 void __wait_on_buffer(struct buffer_head * bh) 144 { 145 struct task_struct *tsk = current; 146 DECLARE_WAITQUEUE(wait, tsk); 147 148 atomic_inc(&bh->b_count); 149 add_wait_queue(&bh->b_wait, &wait); 150 repeat: 151 run_task_queue(&tq_disk); 152 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 153 if (buffer_locked(bh)) { 154 schedule(); 155 goto repeat; 156 } 157 tsk->state = TASK_RUNNING; 158 remove_wait_queue(&bh->b_wait, &wait); 159 atomic_dec(&bh->b_count); 160 } 161 162 /* Call sync_buffers with wait!=0 to ensure that the call does not 163 * return until all buffer writes have completed. Sync() may return 164 * before the writes have finished; fsync() may not. 165 */ 166 167 /* Godamity-damn. Some buffers (bitmaps for filesystems) 168 * spontaneously dirty themselves without ever brelse being called. 169 * We will ultimately want to put these in a separate list, but for 170 * now we search all of the lists for dirty buffers. 171 */ 172 static int sync_buffers(kdev_t dev, int wait) 173 { 174 int i, retry, pass = 0, err = 0; 175 struct buffer_head * bh, *next; 176 177 /* One pass for no-wait, three for wait: 178 * 0) write out all dirty, unlocked buffers; 179 * 1) write out all dirty buffers, waiting if locked; 180 * 2) wait for completion by waiting for all buffers to unlock. 181 */ 182 do { 183 retry = 0; 184 185 /* We search all lists as a failsafe mechanism, not because we expect 186 * there to be dirty buffers on any of the other lists. 187 */ 188 repeat: 189 spin_lock(&lru_list_lock); 190 bh = lru_list[BUF_DIRTY]; 191 if (!bh) 192 goto repeat2; 193 194 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { 195 next = bh->b_next_free; 196 197 if (!lru_list[BUF_DIRTY]) 198 break; 199 if (dev && bh->b_dev != dev) 200 continue; 201 if (buffer_locked(bh)) { 202 /* Buffer is locked; skip it unless wait is 203 * requested AND pass > 0. 
204 */ 205 if (!wait || !pass) { 206 retry = 1; 207 continue; 208 } 209 atomic_inc(&bh->b_count); 210 spin_unlock(&lru_list_lock); 211 wait_on_buffer (bh); 212 atomic_dec(&bh->b_count); 213 goto repeat; 214 } 215 216 /* If an unlocked buffer is not uptodate, there has 217 * been an IO error. Skip it. 218 */ 219 if (wait && buffer_req(bh) && !buffer_locked(bh) && 220 !buffer_dirty(bh) && !buffer_uptodate(bh)) { 221 err = -EIO; 222 continue; 223 } 224 225 /* Don't write clean buffers. Don't write ANY buffers 226 * on the third pass. 227 */ 228 if (!buffer_dirty(bh) || pass >= 2) 229 continue; 230 231 atomic_inc(&bh->b_count); 232 spin_unlock(&lru_list_lock); 233 ll_rw_block(WRITE, 1, &bh); 234 atomic_dec(&bh->b_count); 235 retry = 1; 236 goto repeat; 237 } 238 239 repeat2: 240 bh = lru_list[BUF_LOCKED]; 241 if (!bh) { 242 spin_unlock(&lru_list_lock); 243 break; 244 } 245 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) { 246 next = bh->b_next_free; 247 248 if (!lru_list[BUF_LOCKED]) 249 break; 250 if (dev && bh->b_dev != dev) 251 continue; 252 if (buffer_locked(bh)) { 253 /* Buffer is locked; skip it unless wait is 254 * requested AND pass > 0. 255 */ 256 if (!wait || !pass) { 257 retry = 1; 258 continue; 259 } 260 atomic_inc(&bh->b_count); 261 spin_unlock(&lru_list_lock); 262 wait_on_buffer (bh); 263 spin_lock(&lru_list_lock); 264 atomic_dec(&bh->b_count); 265 goto repeat2; 266 } 267 } 268 spin_unlock(&lru_list_lock); 269 270 /* If we are waiting for the sync to succeed, and if any dirty 271 * blocks were written, then repeat; on the second pass, only 272 * wait for buffers being written (do not pass to write any 273 * more buffers on the second pass). 274 */ 275 } while (wait && retry && ++pass<=2); 276 return err; 277 } 278 279 void sync_dev(kdev_t dev) 280 { 281 sync_supers(dev); 282 sync_inodes(dev); 283 DQUOT_SYNC(dev); 284 /* sync all the dirty buffers out to disk only _after_ all the 285 high level layers finished generated buffer dirty data 286 (or we'll return with some buffer still dirty on the blockdevice 287 so breaking the semantics of this call) */ 288 sync_buffers(dev, 0); 289 /* 290 * FIXME(eric) we need to sync the physical devices here. 291 * This is because some (scsi) controllers have huge amounts of 292 * cache onboard (hundreds of Mb), and we need to instruct 293 * them to commit all of the dirty memory to disk, and we should 294 * not return until this has happened. 295 * 296 * This would need to get implemented by going through the assorted 297 * layers so that each block major number can be synced, and this 298 * would call down into the upper and mid-layer scsi. 299 */ 300 } 301 302 int fsync_dev(kdev_t dev) 303 { 304 sync_buffers(dev, 0); 305 306 lock_kernel(); 307 sync_supers(dev); 308 sync_inodes(dev); 309 DQUOT_SYNC(dev); 310 unlock_kernel(); 311 312 return sync_buffers(dev, 1); 313 } 314 315 asmlinkage long sys_sync(void) 316 { 317 fsync_dev(0); 318 return 0; 319 } 320 321 /* 322 * filp may be NULL if called via the msync of a vma. 323 */ 324 325 int file_fsync(struct file *filp, struct dentry *dentry) 326 { 327 struct inode * inode = dentry->d_inode; 328 struct super_block * sb; 329 kdev_t dev; 330 int ret; 331 332 lock_kernel(); 333 /* sync the inode to buffers */ 334 write_inode_now(inode); 335 336 /* sync the superblock to buffers */ 337 sb = inode->i_sb; 338 wait_on_super(sb); 339 if (sb->s_op && sb->s_op->write_super) 340 sb->s_op->write_super(sb); 341 342 /* .. 
finally sync the buffers to disk */ 343 dev = inode->i_dev; 344 ret = sync_buffers(dev, 1); 345 unlock_kernel(); 346 return ret; 347 } 348 349 asmlinkage long sys_fsync(unsigned int fd) 350 { 351 struct file * file; 352 struct dentry * dentry; 353 struct inode * inode; 354 int err; 355 356 err = -EBADF; 357 file = fget(fd); 358 if (!file) 359 goto out; 360 361 dentry = file->f_dentry; 362 if (!dentry) 363 goto out_putf; 364 365 inode = dentry->d_inode; 366 if (!inode) 367 goto out_putf; 368 369 err = -EINVAL; 370 if (!file->f_op || !file->f_op->fsync) 371 goto out_putf; 372 373 /* We need to protect against concurrent writers.. */ 374 down(&inode->i_sem); 375 err = file->f_op->fsync(file, dentry); 376 up(&inode->i_sem); 377 378 out_putf: 379 fput(file); 380 out: 381 return err; 382 } 383 384 asmlinkage long sys_fdatasync(unsigned int fd) 385 { 386 struct file * file; 387 struct dentry * dentry; 388 struct inode * inode; 389 int err; 390 391 err = -EBADF; 392 file = fget(fd); 393 if (!file) 394 goto out; 395 396 dentry = file->f_dentry; 397 if (!dentry) 398 goto out_putf; 399 400 inode = dentry->d_inode; 401 if (!inode) 402 goto out_putf; 403 404 err = -EINVAL; 405 if (!file->f_op || !file->f_op->fsync) 406 goto out_putf; 407 408 /* this needs further work, at the moment it is identical to fsync() */ 409 down(&inode->i_sem); 410 err = file->f_op->fsync(file, dentry); 411 up(&inode->i_sem); 412 413 out_putf: 414 fput(file); 415 out: 416 return err; 417 } 418 419 /* After several hours of tedious analysis, the following hash 420 * function won. Do not mess with it... -DaveM 421 */ 422 #define _hashfn(dev,block) \ 423 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ 424 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12)))) 425 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)] 426 427 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) 428 { 429 if ((bh->b_next = *head) != NULL) 430 bh->b_next->b_pprev = &bh->b_next; 431 *head = bh; 432 bh->b_pprev = head; 433 } 434 435 static __inline__ void __hash_unlink(struct buffer_head *bh) 436 { 437 if (bh->b_pprev) { 438 if (bh->b_next) 439 bh->b_next->b_pprev = bh->b_pprev; 440 *(bh->b_pprev) = bh->b_next; 441 bh->b_pprev = NULL; 442 } 443 } 444 445 static void __insert_into_lru_list(struct buffer_head * bh, int blist) 446 { 447 struct buffer_head **bhp = &lru_list[blist]; 448 449 if(!*bhp) { 450 *bhp = bh; 451 bh->b_prev_free = bh; 452 } 453 bh->b_next_free = *bhp; 454 bh->b_prev_free = (*bhp)->b_prev_free; 455 (*bhp)->b_prev_free->b_next_free = bh; 456 (*bhp)->b_prev_free = bh; 457 nr_buffers_type[blist]++; 458 size_buffers_type[blist] += bh->b_size; 459 } 460 461 static void __remove_from_lru_list(struct buffer_head * bh, int blist) 462 { 463 if (bh->b_prev_free || bh->b_next_free) { 464 bh->b_prev_free->b_next_free = bh->b_next_free; 465 bh->b_next_free->b_prev_free = bh->b_prev_free; 466 if (lru_list[blist] == bh) 467 lru_list[blist] = bh->b_next_free; 468 if (lru_list[blist] == bh) 469 lru_list[blist] = NULL; 470 bh->b_next_free = bh->b_prev_free = NULL; 471 nr_buffers_type[blist]--; 472 size_buffers_type[blist] -= bh->b_size; 473 } 474 } 475 476 static void __remove_from_free_list(struct buffer_head * bh, int index) 477 { 478 if(bh->b_next_free == bh) 479 free_list[index].list = NULL; 480 else { 481 bh->b_prev_free->b_next_free = bh->b_next_free; 482 bh->b_next_free->b_prev_free = bh->b_prev_free; 483 if (free_list[index].list == bh) 484 
free_list[index].list = bh->b_next_free; 485 } 486 bh->b_next_free = bh->b_prev_free = NULL; 487 } 488 489 /* must be called with both the hash_table_lock and the lru_list_lock 490 held */ 491 static void __remove_from_queues(struct buffer_head *bh) 492 { 493 __hash_unlink(bh); 494 __remove_from_lru_list(bh, bh->b_list); 495 } 496 497 static void insert_into_queues(struct buffer_head *bh) 498 { 499 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); 500 501 spin_lock(&lru_list_lock); 502 write_lock(&hash_table_lock); 503 __hash_link(bh, head); 504 __insert_into_lru_list(bh, bh->b_list); 505 write_unlock(&hash_table_lock); 506 spin_unlock(&lru_list_lock); 507 } 508 509 /* This function must only run if there are no other 510 * references _anywhere_ to this buffer head. 511 */ 512 static void put_last_free(struct buffer_head * bh) 513 { 514 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; 515 struct buffer_head **bhp = &head->list; 516 517 bh->b_state = 0; 518 519 spin_lock(&head->lock); 520 bh->b_dev = B_FREE; 521 if(!*bhp) { 522 *bhp = bh; 523 bh->b_prev_free = bh; 524 } 525 bh->b_next_free = *bhp; 526 bh->b_prev_free = (*bhp)->b_prev_free; 527 (*bhp)->b_prev_free->b_next_free = bh; 528 (*bhp)->b_prev_free = bh; 529 spin_unlock(&head->lock); 530 } 531 532 /* 533 * Why like this, I hear you say... The reason is race-conditions. 534 * As we don't lock buffers (unless we are reading them, that is), 535 * something might happen to it while we sleep (ie a read-error 536 * will force it bad). This shouldn't really happen currently, but 537 * the code is ready. 538 */ 539 struct buffer_head * get_hash_table(kdev_t dev, int block, int size) 540 { 541 struct buffer_head **head = &hash(dev, block); 542 struct buffer_head *bh; 543 544 read_lock(&hash_table_lock); 545 for(bh = *head; bh; bh = bh->b_next) 546 if (bh->b_blocknr == block && 547 bh->b_size == size && 548 bh->b_dev == dev) 549 break; 550 if (bh) 551 atomic_inc(&bh->b_count); 552 read_unlock(&hash_table_lock); 553 554 return bh; 555 } 556 557 unsigned int get_hardblocksize(kdev_t dev) 558 { 559 /* 560 * Get the hard sector size for the given device. If we don't know 561 * what it is, return 0. 562 */ 563 if (hardsect_size[MAJOR(dev)] != NULL) { 564 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)]; 565 if (blksize != 0) 566 return blksize; 567 } 568 569 /* 570 * We don't know what the hardware sector size for this device is. 571 * Return 0 indicating that we don't know. 572 */ 573 return 0; 574 } 575 576 /* If invalidate_buffers() will trash dirty buffers, it means some kind 577 of fs corruption is going on. Trashing dirty data always imply losing 578 information that was supposed to be just stored on the physical layer 579 by the user. 580 581 Thus invalidate_buffers in general usage is not allwowed to trash dirty 582 buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. 583 584 NOTE: In the case where the user removed a removable-media-disk even if 585 there's still dirty data not synced on disk (due a bug in the device driver 586 or due an error of the user), by not destroying the dirty buffers we could 587 generate corruption also on the next media inserted, thus a parameter is 588 necessary to handle this case in the most safe way possible (trying 589 to not corrupt also the new disk inserted with the data belonging to 590 the old now corrupted disk). Also for the ramdisk the natural thing 591 to do in order to release the ramdisk memory is to destroy dirty buffers. 
592 593 These are two special cases. Normal usage imply the device driver 594 to issue a sync on the device (without waiting I/O completation) and 595 then an invalidate_buffers call that doesn't trashes dirty buffers. */ 596 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) 597 { 598 int i, nlist, slept; 599 struct buffer_head * bh, * bh_next; 600 601 retry: 602 slept = 0; 603 spin_lock(&lru_list_lock); 604 for(nlist = 0; nlist < NR_LIST; nlist++) { 605 bh = lru_list[nlist]; 606 if (!bh) 607 continue; 608 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { 609 bh_next = bh->b_next_free; 610 if (bh->b_dev != dev) 611 continue; 612 if (buffer_locked(bh)) { 613 atomic_inc(&bh->b_count); 614 spin_unlock(&lru_list_lock); 615 wait_on_buffer(bh); 616 slept = 1; 617 spin_lock(&lru_list_lock); 618 atomic_dec(&bh->b_count); 619 } 620 621 write_lock(&hash_table_lock); 622 if (!atomic_read(&bh->b_count) && 623 (destroy_dirty_buffers || !buffer_dirty(bh))) { 624 __remove_from_queues(bh); 625 put_last_free(bh); 626 } 627 write_unlock(&hash_table_lock); 628 if (slept) 629 goto out; 630 } 631 } 632 out: 633 spin_unlock(&lru_list_lock); 634 if (slept) 635 goto retry; 636 } 637 638 void set_blocksize(kdev_t dev, int size) 639 { 640 extern int *blksize_size[]; 641 int i, nlist, slept; 642 struct buffer_head * bh, * bh_next; 643 644 if (!blksize_size[MAJOR(dev)]) 645 return; 646 647 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 648 if (size > PAGE_SIZE || size < 512 || (size & (size-1))) 649 panic("Invalid blocksize passed to set_blocksize"); 650 651 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) { 652 blksize_size[MAJOR(dev)][MINOR(dev)] = size; 653 return; 654 } 655 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size) 656 return; 657 sync_buffers(dev, 2); 658 blksize_size[MAJOR(dev)][MINOR(dev)] = size; 659 660 retry: 661 slept = 0; 662 spin_lock(&lru_list_lock); 663 for(nlist = 0; nlist < NR_LIST; nlist++) { 664 bh = lru_list[nlist]; 665 if (!bh) 666 continue; 667 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { 668 bh_next = bh->b_next_free; 669 if (bh->b_dev != dev || bh->b_size == size) 670 continue; 671 if (buffer_locked(bh)) { 672 atomic_inc(&bh->b_count); 673 spin_unlock(&lru_list_lock); 674 wait_on_buffer(bh); 675 slept = 1; 676 spin_lock(&lru_list_lock); 677 atomic_dec(&bh->b_count); 678 } 679 680 write_lock(&hash_table_lock); 681 if (!atomic_read(&bh->b_count)) { 682 if (buffer_dirty(bh)) 683 printk(KERN_WARNING 684 "set_blocksize: dev %s buffer_dirty %lu size %hu\n", 685 kdevname(dev), bh->b_blocknr, bh->b_size); 686 __remove_from_queues(bh); 687 put_last_free(bh); 688 } else { 689 if (atomic_set_buffer_clean(bh)) 690 __refile_buffer(bh); 691 clear_bit(BH_Uptodate, &bh->b_state); 692 printk(KERN_WARNING 693 "set_blocksize: " 694 "b_count %d, dev %s, block %lu, from %p\n", 695 atomic_read(&bh->b_count), bdevname(bh->b_dev), 696 bh->b_blocknr, __builtin_return_address(0)); 697 } 698 write_unlock(&hash_table_lock); 699 if (slept) 700 goto out; 701 } 702 } 703 out: 704 spin_unlock(&lru_list_lock); 705 if (slept) 706 goto retry; 707 } 708 709 /* 710 * We used to try various strange things. Let's not. 
711 */ 712 static void refill_freelist(int size) 713 { 714 if (!grow_buffers(size)) { 715 wakeup_bdflush(1); 716 current->policy |= SCHED_YIELD; 717 schedule(); 718 } 719 } 720 721 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id) 722 { 723 bh->b_list = BUF_CLEAN; 724 bh->b_end_io = handler; 725 bh->b_dev_id = dev_id; 726 } 727 728 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate) 729 { 730 mark_buffer_uptodate(bh, uptodate); 731 unlock_buffer(bh); 732 } 733 734 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate) 735 { 736 mark_buffer_uptodate(bh, uptodate); 737 unlock_buffer(bh); 738 BUG(); 739 } 740 741 static void end_buffer_io_async(struct buffer_head * bh, int uptodate) 742 { 743 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; 744 unsigned long flags; 745 struct buffer_head *tmp; 746 struct page *page; 747 748 mark_buffer_uptodate(bh, uptodate); 749 750 /* This is a temporary buffer used for page I/O. */ 751 page = bh->b_page; 752 753 if (!uptodate) 754 SetPageError(page); 755 756 /* 757 * Be _very_ careful from here on. Bad things can happen if 758 * two buffer heads end IO at almost the same time and both 759 * decide that the page is now completely done. 760 * 761 * Async buffer_heads are here only as labels for IO, and get 762 * thrown away once the IO for this page is complete. IO is 763 * deemed complete once all buffers have been visited 764 * (b_count==0) and are now unlocked. We must make sure that 765 * only the _last_ buffer that decrements its count is the one 766 * that unlock the page.. 767 */ 768 spin_lock_irqsave(&page_uptodate_lock, flags); 769 unlock_buffer(bh); 770 atomic_dec(&bh->b_count); 771 tmp = bh->b_this_page; 772 while (tmp != bh) { 773 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) 774 goto still_busy; 775 tmp = tmp->b_this_page; 776 } 777 778 /* OK, the async IO on this page is complete. */ 779 spin_unlock_irqrestore(&page_uptodate_lock, flags); 780 781 /* 782 * if none of the buffers had errors then we can set the 783 * page uptodate: 784 */ 785 if (!PageError(page)) 786 SetPageUptodate(page); 787 788 /* 789 * Run the hooks that have to be done when a page I/O has completed. 790 */ 791 if (test_and_clear_bit(PG_decr_after, &page->flags)) 792 atomic_dec(&nr_async_pages); 793 794 UnlockPage(page); 795 796 return; 797 798 still_busy: 799 spin_unlock_irqrestore(&page_uptodate_lock, flags); 800 return; 801 } 802 803 /* 804 * Ok, this is getblk, and it isn't very clear, again to hinder 805 * race-conditions. Most of the code is seldom used, (ie repeating), 806 * so it should be much more efficient than it looks. 807 * 808 * The algorithm is changed: hopefully better, and an elusive bug removed. 809 * 810 * 14.02.92: changed it to sync dirty buffers a bit: better performance 811 * when the filesystem starts to get full of dirty blocks (I hope). 812 */ 813 struct buffer_head * getblk(kdev_t dev, int block, int size) 814 { 815 struct buffer_head * bh; 816 int isize; 817 818 repeat: 819 bh = get_hash_table(dev, block, size); 820 if (bh) 821 goto out; 822 823 isize = BUFSIZE_INDEX(size); 824 spin_lock(&free_list[isize].lock); 825 bh = free_list[isize].list; 826 if (bh) { 827 __remove_from_free_list(bh, isize); 828 atomic_set(&bh->b_count, 1); 829 } 830 spin_unlock(&free_list[isize].lock); 831 832 /* 833 * OK, FINALLY we know that this buffer is the only one of 834 * its kind, we hold a reference (b_count>0), it is unlocked, 835 * and it is clean. 
836 */ 837 if (bh) { 838 init_buffer(bh, end_buffer_io_sync, NULL); 839 bh->b_dev = dev; 840 bh->b_blocknr = block; 841 bh->b_state = 1 << BH_Mapped; 842 843 /* Insert the buffer into the regular lists */ 844 insert_into_queues(bh); 845 out: 846 touch_buffer(bh); 847 return bh; 848 } 849 850 /* 851 * If we block while refilling the free list, somebody may 852 * create the buffer first ... search the hashes again. 853 */ 854 refill_freelist(size); 855 goto repeat; 856 } 857 858 /* -1 -> no need to flush 859 0 -> async flush 860 1 -> sync flush (wait for I/O completation) */ 861 static int balance_dirty_state(kdev_t dev) 862 { 863 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; 864 865 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; 866 tot = nr_free_buffer_pages(); 867 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT; 868 869 dirty *= 200; 870 soft_dirty_limit = tot * bdf_prm.b_un.nfract; 871 hard_dirty_limit = soft_dirty_limit * 2; 872 873 if (dirty > soft_dirty_limit) { 874 if (dirty > hard_dirty_limit) 875 return 1; 876 return 0; 877 } 878 return -1; 879 } 880 881 /* 882 * if a new dirty buffer is created we need to balance bdflush. 883 * 884 * in the future we might want to make bdflush aware of different 885 * pressures on different devices - thus the (currently unused) 886 * 'dev' parameter. 887 */ 888 void balance_dirty(kdev_t dev) 889 { 890 int state = balance_dirty_state(dev); 891 892 if (state < 0) 893 return; 894 wakeup_bdflush(state); 895 } 896 897 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag) 898 { 899 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer); 900 refile_buffer(bh); 901 } 902 903 /* atomic version, the user must call balance_dirty() by hand 904 as soon as it become possible to block */ 905 void __mark_buffer_dirty(struct buffer_head *bh, int flag) 906 { 907 if (!atomic_set_buffer_dirty(bh)) 908 __mark_dirty(bh, flag); 909 } 910 911 void mark_buffer_dirty(struct buffer_head *bh, int flag) 912 { 913 __mark_buffer_dirty(bh, flag); 914 balance_dirty(bh->b_dev); 915 } 916 917 /* 918 * A buffer may need to be moved from one buffer list to another 919 * (e.g. in case it is not shared any more). Handle this. 920 */ 921 static void __refile_buffer(struct buffer_head *bh) 922 { 923 int dispose = BUF_CLEAN; 924 if (buffer_locked(bh)) 925 dispose = BUF_LOCKED; 926 if (buffer_dirty(bh)) 927 dispose = BUF_DIRTY; 928 if (buffer_protected(bh)) 929 dispose = BUF_PROTECTED; 930 if (dispose != bh->b_list) { 931 __remove_from_lru_list(bh, bh->b_list); 932 bh->b_list = dispose; 933 __insert_into_lru_list(bh, dispose); 934 } 935 } 936 937 void refile_buffer(struct buffer_head *bh) 938 { 939 spin_lock(&lru_list_lock); 940 __refile_buffer(bh); 941 spin_unlock(&lru_list_lock); 942 } 943 944 /* 945 * Release a buffer head 946 */ 947 void __brelse(struct buffer_head * buf) 948 { 949 if (atomic_read(&buf->b_count)) { 950 atomic_dec(&buf->b_count); 951 return; 952 } 953 printk("VFS: brelse: Trying to free free buffer\n"); 954 } 955 956 /* 957 * bforget() is like brelse(), except it puts the buffer on the 958 * free list if it can.. We can NOT free the buffer if: 959 * - there are other users of it 960 * - it is locked and thus can have active IO 961 */ 962 void __bforget(struct buffer_head * buf) 963 { 964 /* grab the lru lock here to block bdflush. 
*/ 965 spin_lock(&lru_list_lock); 966 write_lock(&hash_table_lock); 967 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) 968 goto in_use; 969 __hash_unlink(buf); 970 write_unlock(&hash_table_lock); 971 __remove_from_lru_list(buf, buf->b_list); 972 spin_unlock(&lru_list_lock); 973 put_last_free(buf); 974 return; 975 976 in_use: 977 write_unlock(&hash_table_lock); 978 spin_unlock(&lru_list_lock); 979 } 980 981 /* 982 * bread() reads a specified block and returns the buffer that contains 983 * it. It returns NULL if the block was unreadable. 984 */ 985 struct buffer_head * bread(kdev_t dev, int block, int size) 986 { 987 struct buffer_head * bh; 988 989 bh = getblk(dev, block, size); 990 if (buffer_uptodate(bh)) 991 return bh; 992 ll_rw_block(READ, 1, &bh); 993 wait_on_buffer(bh); 994 if (buffer_uptodate(bh)) 995 return bh; 996 brelse(bh); 997 return NULL; 998 } 999 1000 /* 1001 * Ok, breada can be used as bread, but additionally to mark other 1002 * blocks for reading as well. End the argument list with a negative 1003 * number. 1004 */ 1005 1006 #define NBUF 16 1007 1008 struct buffer_head * breada(kdev_t dev, int block, int bufsize, 1009 unsigned int pos, unsigned int filesize) 1010 { 1011 struct buffer_head * bhlist[NBUF]; 1012 unsigned int blocks; 1013 struct buffer_head * bh; 1014 int index; 1015 int i, j; 1016 1017 if (pos >= filesize) 1018 return NULL; 1019 1020 if (block < 0) 1021 return NULL; 1022 1023 bh = getblk(dev, block, bufsize); 1024 index = BUFSIZE_INDEX(bh->b_size); 1025 1026 if (buffer_uptodate(bh)) 1027 return(bh); 1028 else ll_rw_block(READ, 1, &bh); 1029 1030 blocks = (filesize - pos) >> (9+index); 1031 1032 if (blocks < (read_ahead[MAJOR(dev)] >> index)) 1033 blocks = read_ahead[MAJOR(dev)] >> index; 1034 if (blocks > NBUF) 1035 blocks = NBUF; 1036 1037 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ 1038 1039 bhlist[0] = bh; 1040 j = 1; 1041 for(i=1; i<blocks; i++) { 1042 bh = getblk(dev,block+i,bufsize); 1043 if (buffer_uptodate(bh)) { 1044 brelse(bh); 1045 break; 1046 } 1047 else bhlist[j++] = bh; 1048 } 1049 1050 /* Request the read for these buffers, and then release them. */ 1051 if (j>1) 1052 ll_rw_block(READA, (j-1), bhlist+1); 1053 for(i=1; i<j; i++) 1054 brelse(bhlist[i]); 1055 1056 /* Wait for this buffer, and then continue on. */ 1057 bh = bhlist[0]; 1058 wait_on_buffer(bh); 1059 if (buffer_uptodate(bh)) 1060 return bh; 1061 brelse(bh); 1062 return NULL; 1063 } 1064 1065 /* 1066 * Note: the caller should wake up the buffer_wait list if needed. 1067 */ 1068 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) 1069 { 1070 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { 1071 kmem_cache_free(bh_cachep, bh); 1072 } else { 1073 bh->b_blocknr = -1; 1074 init_waitqueue_head(&bh->b_wait); 1075 nr_unused_buffer_heads++; 1076 bh->b_next_free = unused_list; 1077 bh->b_this_page = NULL; 1078 unused_list = bh; 1079 } 1080 } 1081 1082 /* 1083 * Reserve NR_RESERVED buffer heads for async IO requests to avoid 1084 * no-buffer-head deadlock. Return NULL on failure; waiting for 1085 * buffer heads is now handled in create_buffers(). 
1086 */ 1087 static struct buffer_head * get_unused_buffer_head(int async) 1088 { 1089 struct buffer_head * bh; 1090 1091 spin_lock(&unused_list_lock); 1092 if (nr_unused_buffer_heads > NR_RESERVED) { 1093 bh = unused_list; 1094 unused_list = bh->b_next_free; 1095 nr_unused_buffer_heads--; 1096 spin_unlock(&unused_list_lock); 1097 return bh; 1098 } 1099 spin_unlock(&unused_list_lock); 1100 1101 /* This is critical. We can't swap out pages to get 1102 * more buffer heads, because the swap-out may need 1103 * more buffer-heads itself. Thus SLAB_BUFFER. 1104 */ 1105 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { 1106 memset(bh, 0, sizeof(*bh)); 1107 init_waitqueue_head(&bh->b_wait); 1108 return bh; 1109 } 1110 1111 /* 1112 * If we need an async buffer, use the reserved buffer heads. 1113 */ 1114 if (async) { 1115 spin_lock(&unused_list_lock); 1116 if (unused_list) { 1117 bh = unused_list; 1118 unused_list = bh->b_next_free; 1119 nr_unused_buffer_heads--; 1120 spin_unlock(&unused_list_lock); 1121 return bh; 1122 } 1123 spin_unlock(&unused_list_lock); 1124 } 1125 #if 0 1126 /* 1127 * (Pending further analysis ...) 1128 * Ordinary (non-async) requests can use a different memory priority 1129 * to free up pages. Any swapping thus generated will use async 1130 * buffer heads. 1131 */ 1132 if(!async && 1133 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { 1134 memset(bh, 0, sizeof(*bh)); 1135 init_waitqueue_head(&bh->b_wait); 1136 return bh; 1137 } 1138 #endif 1139 1140 return NULL; 1141 } 1142 1143 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) 1144 { 1145 bh->b_page = page; 1146 if (offset >= PAGE_SIZE) 1147 BUG(); 1148 if (PageHighMem(page)) 1149 /* 1150 * This catches illegal uses and preserves the offset: 1151 */ 1152 bh->b_data = (char *)(0 + offset); 1153 else 1154 bh->b_data = (char *)(page_address(page) + offset); 1155 } 1156 1157 /* 1158 * Create the appropriate buffers when given a page for data area and 1159 * the size of each buffer.. Use the bh->b_this_page linked list to 1160 * follow the buffers created. Return NULL if unable to create more 1161 * buffers. 1162 * The async flag is used to differentiate async IO (paging, swapping) 1163 * from ordinary buffer allocations, and only async requests are allowed 1164 * to sleep waiting for buffer heads. 1165 */ 1166 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) 1167 { 1168 struct buffer_head *bh, *head; 1169 long offset; 1170 1171 try_again: 1172 head = NULL; 1173 offset = PAGE_SIZE; 1174 while ((offset -= size) >= 0) { 1175 bh = get_unused_buffer_head(async); 1176 if (!bh) 1177 goto no_grow; 1178 1179 bh->b_dev = B_FREE; /* Flag as unused */ 1180 bh->b_this_page = head; 1181 head = bh; 1182 1183 bh->b_state = 0; 1184 bh->b_next_free = NULL; 1185 bh->b_pprev = NULL; 1186 atomic_set(&bh->b_count, 0); 1187 bh->b_size = size; 1188 1189 set_bh_page(bh, page, offset); 1190 1191 bh->b_list = BUF_CLEAN; 1192 bh->b_end_io = end_buffer_io_bad; 1193 } 1194 return head; 1195 /* 1196 * In case anything failed, we just free everything we got. 1197 */ 1198 no_grow: 1199 if (head) { 1200 spin_lock(&unused_list_lock); 1201 do { 1202 bh = head; 1203 head = head->b_this_page; 1204 __put_unused_buffer_head(bh); 1205 } while (head); 1206 spin_unlock(&unused_list_lock); 1207 1208 /* Wake up any waiters ... */ 1209 wake_up(&buffer_wait); 1210 } 1211 1212 /* 1213 * Return failure for non-async IO requests. 
Async IO requests 1214 * are not allowed to fail, so we have to wait until buffer heads 1215 * become available. But we don't want tasks sleeping with 1216 * partially complete buffers, so all were released above. 1217 */ 1218 if (!async) 1219 return NULL; 1220 1221 /* We're _really_ low on memory. Now we just 1222 * wait for old buffer heads to become free due to 1223 * finishing IO. Since this is an async request and 1224 * the reserve list is empty, we're sure there are 1225 * async buffer heads in use. 1226 */ 1227 run_task_queue(&tq_disk); 1228 1229 /* 1230 * Set our state for sleeping, then check again for buffer heads. 1231 * This ensures we won't miss a wake_up from an interrupt. 1232 */ 1233 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE); 1234 goto try_again; 1235 } 1236 1237 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) 1238 { 1239 struct buffer_head *head, *bh, *tail; 1240 int block; 1241 1242 if (!PageLocked(page)) 1243 BUG(); 1244 /* 1245 * Allocate async buffer heads pointing to this page, just for I/O. 1246 * They don't show up in the buffer hash table, but they *are* 1247 * registered in page->buffers. 1248 */ 1249 head = create_buffers(page, size, 1); 1250 if (page->buffers) 1251 BUG(); 1252 if (!head) 1253 BUG(); 1254 tail = head; 1255 for (bh = head; bh; bh = bh->b_this_page) { 1256 block = *(b++); 1257 1258 tail = bh; 1259 init_buffer(bh, end_buffer_io_async, NULL); 1260 bh->b_dev = dev; 1261 bh->b_blocknr = block; 1262 1263 set_bit(BH_Mapped, &bh->b_state); 1264 } 1265 tail->b_this_page = head; 1266 get_page(page); 1267 page->buffers = head; 1268 return 0; 1269 } 1270 1271 static void unmap_buffer(struct buffer_head * bh) 1272 { 1273 if (buffer_mapped(bh)) { 1274 mark_buffer_clean(bh); 1275 wait_on_buffer(bh); 1276 clear_bit(BH_Uptodate, &bh->b_state); 1277 clear_bit(BH_Mapped, &bh->b_state); 1278 clear_bit(BH_Req, &bh->b_state); 1279 clear_bit(BH_New, &bh->b_state); 1280 } 1281 } 1282 1283 /* 1284 * We don't have to release all buffers here, but 1285 * we have to be sure that no dirty buffer is left 1286 * and no IO is going on (no buffer is locked), because 1287 * we have truncated the file and are going to free the 1288 * blocks on-disk.. 1289 */ 1290 int block_flushpage(struct page *page, unsigned long offset) 1291 { 1292 struct buffer_head *head, *bh, *next; 1293 unsigned int curr_off = 0; 1294 1295 if (!PageLocked(page)) 1296 BUG(); 1297 if (!page->buffers) 1298 return 1; 1299 1300 head = page->buffers; 1301 bh = head; 1302 do { 1303 unsigned int next_off = curr_off + bh->b_size; 1304 next = bh->b_this_page; 1305 1306 /* 1307 * is this block fully flushed? 1308 */ 1309 if (offset <= curr_off) 1310 unmap_buffer(bh); 1311 curr_off = next_off; 1312 bh = next; 1313 } while (bh != head); 1314 1315 /* 1316 * subtle. We release buffer-heads only if this is 1317 * the 'final' flushpage. We have invalidated the get_block 1318 * cached value unconditionally, so real IO is not 1319 * possible anymore. 1320 * 1321 * If the free doesn't work out, the buffers can be 1322 * left around - they just turn into anonymous buffers 1323 * instead. 
1324 */ 1325 if (!offset) { 1326 if (!try_to_free_buffers(page)) { 1327 atomic_inc(&buffermem_pages); 1328 return 0; 1329 } 1330 } 1331 1332 return 1; 1333 } 1334 1335 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize) 1336 { 1337 struct buffer_head *bh, *head, *tail; 1338 1339 head = create_buffers(page, blocksize, 1); 1340 if (page->buffers) 1341 BUG(); 1342 1343 bh = head; 1344 do { 1345 bh->b_dev = inode->i_dev; 1346 bh->b_blocknr = 0; 1347 bh->b_end_io = end_buffer_io_bad; 1348 tail = bh; 1349 bh = bh->b_this_page; 1350 } while (bh); 1351 tail->b_this_page = head; 1352 page->buffers = head; 1353 get_page(page); 1354 } 1355 1356 static void unmap_underlying_metadata(struct buffer_head * bh) 1357 { 1358 struct buffer_head *old_bh; 1359 1360 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); 1361 if (old_bh) { 1362 unmap_buffer(old_bh); 1363 /* Here we could run brelse or bforget. We use 1364 bforget because it will try to put the buffer 1365 in the freelist. */ 1366 __bforget(old_bh); 1367 } 1368 } 1369 1370 /* 1371 * block_write_full_page() is SMP-safe - currently it's still 1372 * being called with the kernel lock held, but the code is ready. 1373 */ 1374 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) 1375 { 1376 int err, i, need_balance_dirty = 0; 1377 unsigned long block; 1378 struct buffer_head *bh, *head; 1379 1380 if (!PageLocked(page)) 1381 BUG(); 1382 1383 if (!page->buffers) 1384 create_empty_buffers(page, inode, inode->i_sb->s_blocksize); 1385 head = page->buffers; 1386 1387 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1388 1389 bh = head; 1390 i = 0; 1391 do { 1392 /* 1393 * If the buffer isn't up-to-date, we can't be sure 1394 * that the buffer has been initialized with the proper 1395 * block number information etc.. 
1396 * 1397 * Leave it to the low-level FS to make all those 1398 * decisions (block #0 may actually be a valid block) 1399 */ 1400 bh->b_end_io = end_buffer_io_sync; 1401 if (!buffer_mapped(bh)) { 1402 err = get_block(inode, block, bh, 1); 1403 if (err) 1404 goto out; 1405 if (buffer_new(bh)) 1406 unmap_underlying_metadata(bh); 1407 } 1408 set_bit(BH_Uptodate, &bh->b_state); 1409 if (!atomic_set_buffer_dirty(bh)) { 1410 __mark_dirty(bh, 0); 1411 need_balance_dirty = 1; 1412 } 1413 1414 bh = bh->b_this_page; 1415 block++; 1416 } while (bh != head); 1417 1418 if (need_balance_dirty) 1419 balance_dirty(bh->b_dev); 1420 1421 SetPageUptodate(page); 1422 return 0; 1423 out: 1424 ClearPageUptodate(page); 1425 return err; 1426 } 1427 1428 static int __block_prepare_write(struct inode *inode, struct page *page, 1429 unsigned from, unsigned to, get_block_t *get_block) 1430 { 1431 unsigned block_start, block_end; 1432 unsigned long block; 1433 int err = 0; 1434 unsigned blocksize, bbits; 1435 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1436 char *kaddr = (char *)kmap(page); 1437 1438 blocksize = inode->i_sb->s_blocksize; 1439 if (!page->buffers) 1440 create_empty_buffers(page, inode, blocksize); 1441 head = page->buffers; 1442 1443 bbits = inode->i_sb->s_blocksize_bits; 1444 block = page->index << (PAGE_CACHE_SHIFT - bbits); 1445 1446 for(bh = head, block_start = 0; bh != head || !block_start; 1447 block++, block_start=block_end, bh = bh->b_this_page) { 1448 if (!bh) 1449 BUG(); 1450 block_end = block_start+blocksize; 1451 if (block_end <= from) 1452 continue; 1453 if (block_start >= to) 1454 break; 1455 bh->b_end_io = end_buffer_io_sync; 1456 if (!buffer_mapped(bh)) { 1457 err = get_block(inode, block, bh, 1); 1458 if (err) 1459 goto out; 1460 if (buffer_new(bh)) { 1461 unmap_underlying_metadata(bh); 1462 if (block_end > to) 1463 memset(kaddr+to, 0, block_end-to); 1464 if (block_start < from) 1465 memset(kaddr+block_start, 0, from-block_start); 1466 continue; 1467 } 1468 } 1469 if (!buffer_uptodate(bh) && 1470 (block_start < from || block_end > to)) { 1471 ll_rw_block(READ, 1, &bh); 1472 *wait_bh++=bh; 1473 } 1474 } 1475 /* 1476 * If we issued read requests - let them complete. 1477 */ 1478 while(wait_bh > wait) { 1479 wait_on_buffer(*--wait_bh); 1480 err = -EIO; 1481 if (!buffer_uptodate(*wait_bh)) 1482 goto out; 1483 } 1484 return 0; 1485 out: 1486 return err; 1487 } 1488 1489 static int __block_commit_write(struct inode *inode, struct page *page, 1490 unsigned from, unsigned to) 1491 { 1492 unsigned block_start, block_end; 1493 int partial = 0, need_balance_dirty = 0; 1494 unsigned blocksize; 1495 struct buffer_head *bh, *head; 1496 1497 blocksize = inode->i_sb->s_blocksize; 1498 1499 for(bh = head = page->buffers, block_start = 0; 1500 bh != head || !block_start; 1501 block_start=block_end, bh = bh->b_this_page) { 1502 block_end = block_start + blocksize; 1503 if (block_end <= from || block_start >= to) { 1504 if (!buffer_uptodate(bh)) 1505 partial = 1; 1506 } else { 1507 set_bit(BH_Uptodate, &bh->b_state); 1508 if (!atomic_set_buffer_dirty(bh)) { 1509 __mark_dirty(bh, 0); 1510 need_balance_dirty = 1; 1511 } 1512 } 1513 } 1514 1515 if (need_balance_dirty) 1516 balance_dirty(bh->b_dev); 1517 /* 1518 * is this a partial write that happened to make all buffers 1519 * uptodate then we can optimize away a bogus readpage() for 1520 * the next read(). Here we 'discover' wether the page went 1521 * uptodate as a result of this (potentially partial) write. 
1522 */ 1523 if (!partial) 1524 SetPageUptodate(page); 1525 return 0; 1526 } 1527 1528 /* 1529 * Generic "read page" function for block devices that have the normal 1530 * get_block functionality. This is most of the block device filesystems. 1531 * Reads the page asynchronously --- the unlock_buffer() and 1532 * mark_buffer_uptodate() functions propagate buffer state into the 1533 * page struct once IO has completed. 1534 */ 1535 static inline int __block_read_full_page(struct inode *inode, struct page *page, 1536 get_block_t *get_block) 1537 { 1538 unsigned long iblock; 1539 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 1540 unsigned int blocksize, blocks; 1541 unsigned long kaddr = 0; 1542 int nr, i; 1543 1544 if (!PageLocked(page)) 1545 PAGE_BUG(page); 1546 blocksize = inode->i_sb->s_blocksize; 1547 if (!page->buffers) 1548 create_empty_buffers(page, inode, blocksize); 1549 head = page->buffers; 1550 1551 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; 1552 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1553 bh = head; 1554 nr = 0; 1555 i = 0; 1556 1557 do { 1558 if (buffer_uptodate(bh)) 1559 continue; 1560 1561 if (!buffer_mapped(bh)) { 1562 get_block(inode, iblock, bh, 0); 1563 if (!buffer_mapped(bh)) { 1564 if (!kaddr) 1565 kaddr = kmap(page); 1566 memset((char *)(kaddr + i*blocksize), 0, blocksize); 1567 set_bit(BH_Uptodate, &bh->b_state); 1568 continue; 1569 } 1570 } 1571 1572 init_buffer(bh, end_buffer_io_async, NULL); 1573 atomic_inc(&bh->b_count); 1574 arr[nr] = bh; 1575 nr++; 1576 } while (i++, iblock++, (bh = bh->b_this_page) != head); 1577 1578 ++current->maj_flt; 1579 if (nr) { 1580 if (Page_Uptodate(page)) 1581 BUG(); 1582 ll_rw_block(READ, nr, arr); 1583 } else { 1584 /* 1585 * all buffers are uptodate - we can set the page 1586 * uptodate as well. 1587 */ 1588 SetPageUptodate(page); 1589 UnlockPage(page); 1590 } 1591 if (kaddr) 1592 kunmap(page); 1593 return 0; 1594 } 1595 1596 /* 1597 * For moronic filesystems that do not allow holes in file. 1598 * We may have to extend the file. 
1599 */ 1600 1601 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) 1602 { 1603 struct address_space *mapping = page->mapping; 1604 struct inode *inode = (struct inode*)mapping->host; 1605 struct page *new_page; 1606 unsigned long pgpos; 1607 long status; 1608 unsigned zerofrom; 1609 unsigned blocksize = inode->i_sb->s_blocksize; 1610 char *kaddr; 1611 1612 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { 1613 status = -ENOMEM; 1614 new_page = grab_cache_page(mapping, pgpos); 1615 if (!new_page) 1616 goto out; 1617 /* we might sleep */ 1618 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { 1619 UnlockPage(new_page); 1620 page_cache_release(new_page); 1621 continue; 1622 } 1623 zerofrom = *bytes & ~PAGE_CACHE_MASK; 1624 if (zerofrom & (blocksize-1)) { 1625 *bytes |= (blocksize-1); 1626 (*bytes)++; 1627 } 1628 status = __block_prepare_write(inode, new_page, zerofrom, 1629 PAGE_CACHE_SIZE, get_block); 1630 if (status) 1631 goto out_unmap; 1632 kaddr = (char*)page_address(page); 1633 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); 1634 __block_commit_write(inode, new_page, zerofrom, to); 1635 kunmap(new_page); 1636 UnlockPage(new_page); 1637 page_cache_release(new_page); 1638 } 1639 1640 if (page->index < pgpos) { 1641 /* completely inside the area */ 1642 zerofrom = offset; 1643 } else { 1644 /* page covers the boundary, find the boundary offset */ 1645 zerofrom = *bytes & ~PAGE_CACHE_MASK; 1646 1647 /* if we will expand the thing last block will be filled */ 1648 if (to > zerofrom && (zerofrom & (blocksize-1))) { 1649 *bytes |= (blocksize-1); 1650 (*bytes)++; 1651 } 1652 1653 /* starting below the boundary? Nothing to zero out */ 1654 if (offset <= zerofrom) 1655 zerofrom = offset; 1656 } 1657 status = __block_prepare_write(inode, page, zerofrom, to, get_block); 1658 if (status) 1659 goto out1; 1660 kaddr = (char*)page_address(page); 1661 if (zerofrom < offset) { 1662 memset(kaddr+zerofrom, 0, offset-zerofrom); 1663 __block_commit_write(inode, page, zerofrom, offset); 1664 } 1665 return 0; 1666 out1: 1667 ClearPageUptodate(page); 1668 kunmap(page); 1669 return status; 1670 1671 out_unmap: 1672 ClearPageUptodate(new_page); 1673 kunmap(new_page); 1674 UnlockPage(new_page); 1675 page_cache_release(new_page); 1676 out: 1677 return status; 1678 } 1679 1680 int block_prepare_write(struct page *page, unsigned from, unsigned to, 1681 get_block_t *get_block) 1682 { 1683 struct inode *inode = (struct inode*)page->mapping->host; 1684 int err = __block_prepare_write(inode, page, from, to, get_block); 1685 if (err) { 1686 ClearPageUptodate(page); 1687 kunmap(page); 1688 } 1689 return err; 1690 } 1691 1692 int generic_commit_write(struct file *file, struct page *page, 1693 unsigned from, unsigned to) 1694 { 1695 __block_commit_write((struct inode*)page->mapping->host,page,from,to); 1696 kunmap(page); 1697 return 0; 1698 } 1699 1700 int block_write_full_page(struct page *page, get_block_t *get_block) 1701 { 1702 struct inode *inode = (struct inode*)page->mapping->host; 1703 return __block_write_full_page(inode, page, get_block); 1704 } 1705 1706 int block_read_full_page(struct page *page, get_block_t *get_block) 1707 { 1708 struct inode *inode = (struct inode*)page->mapping->host; 1709 return __block_read_full_page(inode, page, get_block); 1710 } 1711 1712 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) 1713 { 1714 struct buffer_head tmp; 1715 struct inode *inode = (struct 
inode*)mapping->host; 1716 tmp.b_state = 0; 1717 tmp.b_blocknr = 0; 1718 get_block(inode, block, &tmp, 0); 1719 return tmp.b_blocknr; 1720 } 1721 1722 /* 1723 * IO completion routine for a buffer_head being used for kiobuf IO: we 1724 * can't dispatch the kiobuf callback until io_count reaches 0. 1725 */ 1726 1727 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) 1728 { 1729 struct kiobuf *kiobuf; 1730 1731 mark_buffer_uptodate(bh, uptodate); 1732 1733 kiobuf = bh->b_kiobuf; 1734 if (atomic_dec_and_test(&kiobuf->io_count)) 1735 kiobuf->end_io(kiobuf); 1736 if (!uptodate) 1737 kiobuf->errno = -EIO; 1738 } 1739 1740 1741 /* 1742 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait 1743 * for them to complete. Clean up the buffer_heads afterwards. 1744 */ 1745 1746 static int do_kio(struct kiobuf *kiobuf, 1747 int rw, int nr, struct buffer_head *bh[], int size) 1748 { 1749 int iosize; 1750 int i; 1751 struct buffer_head *tmp; 1752 1753 struct task_struct *tsk = current; 1754 DECLARE_WAITQUEUE(wait, tsk); 1755 1756 if (rw == WRITE) 1757 rw = WRITERAW; 1758 atomic_add(nr, &kiobuf->io_count); 1759 kiobuf->errno = 0; 1760 ll_rw_block(rw, nr, bh); 1761 1762 kiobuf_wait_for_io(kiobuf); 1763 1764 spin_lock(&unused_list_lock); 1765 1766 iosize = 0; 1767 for (i = nr; --i >= 0; ) { 1768 iosize += size; 1769 tmp = bh[i]; 1770 if (!buffer_uptodate(tmp)) { 1771 /* We are traversing bh'es in reverse order so 1772 clearing iosize on error calculates the 1773 amount of IO before the first error. */ 1774 iosize = 0; 1775 } 1776 __put_unused_buffer_head(tmp); 1777 } 1778 1779 spin_unlock(&unused_list_lock); 1780 1781 if (iosize) 1782 return iosize; 1783 if (kiobuf->errno) 1784 return kiobuf->errno; 1785 return -EIO; 1786 } 1787 1788 /* 1789 * Start I/O on a physical range of kernel memory, defined by a vector 1790 * of kiobuf structs (much like a user-space iovec list). 1791 * 1792 * The kiobuf must already be locked for IO. IO is submitted 1793 * asynchronously: you need to check page->locked, page->uptodate, and 1794 * maybe wait on page->wait. 1795 * 1796 * It is up to the caller to make sure that there are enough blocks 1797 * passed in to completely map the iobufs to disk. 1798 */ 1799 1800 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 1801 kdev_t dev, unsigned long b[], int size) 1802 { 1803 int err; 1804 int length; 1805 int transferred; 1806 int i; 1807 int bufind; 1808 int pageind; 1809 int bhind; 1810 int offset; 1811 unsigned long blocknr; 1812 struct kiobuf * iobuf = NULL; 1813 struct page * map; 1814 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; 1815 1816 if (!nr) 1817 return 0; 1818 1819 /* 1820 * First, do some alignment and validity checks 1821 */ 1822 for (i = 0; i < nr; i++) { 1823 iobuf = iovec[i]; 1824 if ((iobuf->offset & (size-1)) || 1825 (iobuf->length & (size-1))) 1826 return -EINVAL; 1827 if (!iobuf->locked) 1828 panic("brw_kiovec: iobuf not locked for I/O"); 1829 if (!iobuf->nr_pages) 1830 panic("brw_kiovec: iobuf not initialised"); 1831 } 1832 1833 /* 1834 * OK to walk down the iovec doing page IO on each page we find. 
1835 */ 1836 bufind = bhind = transferred = err = 0; 1837 for (i = 0; i < nr; i++) { 1838 iobuf = iovec[i]; 1839 offset = iobuf->offset; 1840 length = iobuf->length; 1841 1842 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { 1843 map = iobuf->maplist[pageind]; 1844 1845 while (length > 0) { 1846 blocknr = b[bufind++]; 1847 tmp = get_unused_buffer_head(0); 1848 if (!tmp) { 1849 err = -ENOMEM; 1850 goto error; 1851 } 1852 1853 tmp->b_dev = B_FREE; 1854 tmp->b_size = size; 1855 set_bh_page(tmp, map, offset); 1856 tmp->b_this_page = tmp; 1857 1858 init_buffer(tmp, end_buffer_io_kiobuf, NULL); 1859 tmp->b_dev = dev; 1860 tmp->b_blocknr = blocknr; 1861 tmp->b_state = 1 << BH_Mapped; 1862 tmp->b_kiobuf = iobuf; 1863 1864 if (rw == WRITE) { 1865 set_bit(BH_Uptodate, &tmp->b_state); 1866 set_bit(BH_Dirty, &tmp->b_state); 1867 } 1868 1869 bh[bhind++] = tmp; 1870 length -= size; 1871 offset += size; 1872 1873 /* 1874 * Start the IO if we have got too much 1875 */ 1876 if (bhind >= KIO_MAX_SECTORS) { 1877 err = do_kio(iobuf, rw, bhind, bh, size); 1878 if (err >= 0) 1879 transferred += err; 1880 else 1881 goto finished; 1882 bhind = 0; 1883 } 1884 1885 if (offset >= PAGE_SIZE) { 1886 offset = 0; 1887 break; 1888 } 1889 } /* End of block loop */ 1890 } /* End of page loop */ 1891 } /* End of iovec loop */ 1892 1893 /* Is there any IO still left to submit? */ 1894 if (bhind) { 1895 err = do_kio(iobuf, rw, bhind, bh, size); 1896 if (err >= 0) 1897 transferred += err; 1898 else 1899 goto finished; 1900 } 1901 1902 finished: 1903 if (transferred) 1904 return transferred; 1905 return err; 1906 1907 error: 1908 /* We got an error allocating the bh'es. Just free the current 1909 buffer_heads and exit. */ 1910 spin_lock(&unused_list_lock); 1911 for (i = bhind; --i >= 0; ) { 1912 __put_unused_buffer_head(bh[bhind]); 1913 } 1914 spin_unlock(&unused_list_lock); 1915 goto finished; 1916 } 1917 1918 /* 1919 * Start I/O on a page. 1920 * This function expects the page to be locked and may return 1921 * before I/O is complete. You then have to check page->locked, 1922 * page->uptodate, and maybe wait on page->wait. 1923 * 1924 * brw_page() is SMP-safe, although it's being called with the 1925 * kernel lock held - but the code is ready. 1926 * 1927 * FIXME: we need a swapper_inode->get_block function to remove 1928 * some of the bmap kludges and interface ugliness here. 1929 */ 1930 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) 1931 { 1932 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE]; 1933 int nr, fresh /* temporary debugging flag */, block; 1934 1935 if (!PageLocked(page)) 1936 panic("brw_page: page not locked for I/O"); 1937 // clear_bit(PG_error, &page->flags); 1938 /* 1939 * We pretty much rely on the page lock for this, because 1940 * create_page_buffers() might sleep. 
1941 */ 1942 fresh = 0; 1943 if (!page->buffers) { 1944 create_page_buffers(rw, page, dev, b, size); 1945 fresh = 1; 1946 } 1947 if (!page->buffers) 1948 BUG(); 1949 1950 head = page->buffers; 1951 bh = head; 1952 nr = 0; 1953 do { 1954 block = *(b++); 1955 1956 if (fresh && (atomic_read(&bh->b_count) != 0)) 1957 BUG(); 1958 if (rw == READ) { 1959 if (!fresh) 1960 BUG(); 1961 if (!buffer_uptodate(bh)) { 1962 arr[nr++] = bh; 1963 atomic_inc(&bh->b_count); 1964 } 1965 } else { /* WRITE */ 1966 if (!bh->b_blocknr) { 1967 if (!block) 1968 BUG(); 1969 bh->b_blocknr = block; 1970 } else { 1971 if (!block) 1972 BUG(); 1973 } 1974 set_bit(BH_Uptodate, &bh->b_state); 1975 set_bit(BH_Dirty, &bh->b_state); 1976 arr[nr++] = bh; 1977 atomic_inc(&bh->b_count); 1978 } 1979 bh = bh->b_this_page; 1980 } while (bh != head); 1981 if (rw == READ) 1982 ++current->maj_flt; 1983 if ((rw == READ) && nr) { 1984 if (Page_Uptodate(page)) 1985 BUG(); 1986 ll_rw_block(rw, nr, arr); 1987 } else { 1988 if (!nr && rw == READ) { 1989 SetPageUptodate(page); 1990 UnlockPage(page); 1991 } 1992 if (nr && (rw == WRITE)) 1993 ll_rw_block(rw, nr, arr); 1994 } 1995 return 0; 1996 } 1997 1998 int block_symlink(struct inode *inode, const char *symname, int len) 1999 { 2000 struct address_space *mapping = inode->i_mapping; 2001 struct page *page = grab_cache_page(mapping, 0); 2002 int err = -ENOMEM; 2003 char *kaddr; 2004 2005 if (!page) 2006 goto fail; 2007 err = mapping->a_ops->prepare_write(page, 0, len-1); 2008 if (err) 2009 goto fail_map; 2010 kaddr = (char*)page_address(page); 2011 memcpy(kaddr, symname, len-1); 2012 mapping->a_ops->commit_write(NULL, page, 0, len-1); 2013 inode->i_size = len-1; 2014 /* 2015 * Notice that we are _not_ going to block here - end of page is 2016 * unmapped, so this will only try to map the rest of page, see 2017 * that it is unmapped (typically even will not look into inode - 2018 * ->i_size will be enough for everything) and zero it out. 2019 * OTOH it's obviously correct and should make the page up-to-date. 2020 */ 2021 err = mapping->a_ops->readpage(NULL, page); 2022 wait_on_page(page); 2023 page_cache_release(page); 2024 if (err < 0) 2025 goto fail; 2026 mark_inode_dirty(inode); 2027 return 0; 2028 fail_map: 2029 inode->i_size = len-1; 2030 UnlockPage(page); 2031 page_cache_release(page); 2032 fail: 2033 return err; 2034 } 2035 2036 /* 2037 * Try to increase the number of buffers available: the size argument 2038 * is used to determine what kind of buffers we want. 
2039 */ 2040 static int grow_buffers(int size) 2041 { 2042 struct page * page; 2043 struct buffer_head *bh, *tmp; 2044 struct buffer_head * insert_point; 2045 int isize; 2046 2047 if ((size & 511) || (size > PAGE_SIZE)) { 2048 printk("VFS: grow_buffers: size = %d\n",size); 2049 return 0; 2050 } 2051 2052 page = alloc_page(GFP_BUFFER); 2053 if (!page) 2054 goto out; 2055 bh = create_buffers(page, size, 0); 2056 if (!bh) 2057 goto no_buffer_head; 2058 2059 isize = BUFSIZE_INDEX(size); 2060 2061 spin_lock(&free_list[isize].lock); 2062 insert_point = free_list[isize].list; 2063 tmp = bh; 2064 while (1) { 2065 if (insert_point) { 2066 tmp->b_next_free = insert_point->b_next_free; 2067 tmp->b_prev_free = insert_point; 2068 insert_point->b_next_free->b_prev_free = tmp; 2069 insert_point->b_next_free = tmp; 2070 } else { 2071 tmp->b_prev_free = tmp; 2072 tmp->b_next_free = tmp; 2073 } 2074 insert_point = tmp; 2075 if (tmp->b_this_page) 2076 tmp = tmp->b_this_page; 2077 else 2078 break; 2079 } 2080 tmp->b_this_page = bh; 2081 free_list[isize].list = bh; 2082 spin_unlock(&free_list[isize].lock); 2083 2084 page->buffers = bh; 2085 lru_cache_add(page); 2086 atomic_inc(&buffermem_pages); 2087 return 1; 2088 2089 no_buffer_head: 2090 __free_page(page); 2091 out: 2092 return 0; 2093 } 2094 2095 /* 2096 * Can the buffer be thrown out? 2097 */ 2098 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) 2099 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) 2100 2101 /* 2102 * try_to_free_buffers() checks if all the buffers on this particular page 2103 * are unused, and free's the page if so. 2104 * 2105 * Wake up bdflush() if this fails - if we're running low on memory due 2106 * to dirty buffers, we need to flush them out as quickly as possible. 2107 * 2108 * NOTE: There are quite a number of ways that threads of control can 2109 * obtain a reference to a buffer head within a page. So we must 2110 * lock out all of these paths to cleanly toss the page. 2111 */ 2112 int try_to_free_buffers(struct page * page) 2113 { 2114 struct buffer_head * tmp, * p, * bh = page->buffers; 2115 int index = BUFSIZE_INDEX(bh->b_size); 2116 int ret; 2117 2118 spin_lock(&lru_list_lock); 2119 write_lock(&hash_table_lock); 2120 spin_lock(&free_list[index].lock); 2121 tmp = bh; 2122 do { 2123 p = tmp; 2124 2125 tmp = tmp->b_this_page; 2126 if (buffer_busy(p)) 2127 goto busy_buffer_page; 2128 } while (tmp != bh); 2129 2130 spin_lock(&unused_list_lock); 2131 tmp = bh; 2132 do { 2133 struct buffer_head * p = tmp; 2134 tmp = tmp->b_this_page; 2135 2136 /* The buffer can be either on the regular 2137 * queues or on the free list.. 
2138 */ 2139 if (p->b_dev != B_FREE) 2140 __remove_from_queues(p); 2141 else 2142 __remove_from_free_list(p, index); 2143 __put_unused_buffer_head(p); 2144 } while (tmp != bh); 2145 spin_unlock(&unused_list_lock); 2146 2147 /* Wake up anyone waiting for buffer heads */ 2148 wake_up(&buffer_wait); 2149 2150 /* And free the page */ 2151 page->buffers = NULL; 2152 __free_page(page); 2153 ret = 1; 2154 out: 2155 spin_unlock(&free_list[index].lock); 2156 write_unlock(&hash_table_lock); 2157 spin_unlock(&lru_list_lock); 2158 return ret; 2159 2160 busy_buffer_page: 2161 /* Uhhuh, start writeback so that we don't end up with all dirty pages */ 2162 if (buffer_dirty(p)) 2163 wakeup_bdflush(0); 2164 ret = 0; 2165 goto out; 2166 } 2167 2168 /* ================== Debugging =================== */ 2169 2170 void show_buffers(void) 2171 { 2172 #ifdef __SMP__ 2173 struct buffer_head * bh; 2174 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; 2175 int protected = 0; 2176 int nlist; 2177 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", }; 2178 #endif 2179 2180 printk("Buffer memory: %6dkB\n", 2181 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); 2182 2183 #ifdef __SMP__ /* trylock does nothing on UP and so we could deadlock */ 2184 if (!spin_trylock(&lru_list_lock)) 2185 return; 2186 for(nlist = 0; nlist < NR_LIST; nlist++) { 2187 found = locked = dirty = used = lastused = protected = 0; 2188 bh = lru_list[nlist]; 2189 if(!bh) continue; 2190 2191 do { 2192 found++; 2193 if (buffer_locked(bh)) 2194 locked++; 2195 if (buffer_protected(bh)) 2196 protected++; 2197 if (buffer_dirty(bh)) 2198 dirty++; 2199 if (atomic_read(&bh->b_count)) 2200 used++, lastused = found; 2201 bh = bh->b_next_free; 2202 } while (bh != lru_list[nlist]); 2203 { 2204 int tmp = nr_buffers_type[nlist]; 2205 if (found != tmp) 2206 printk("%9s: BUG -> found %d, reported %d\n", 2207 buf_types[nlist], found, tmp); 2208 } 2209 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " 2210 "%d locked, %d protected, %d dirty\n", 2211 buf_types[nlist], found, size_buffers_type[nlist]>>10, 2212 used, lastused, locked, protected, dirty); 2213 } 2214 spin_unlock(&lru_list_lock); 2215 #endif 2216 } 2217 2218 /* ===================== Init ======================= */ 2219 2220 /* 2221 * allocate the hash table and init the free list 2222 * Use gfp() for the hash table to decrease TLB misses, use 2223 * SLAB cache for buffer heads. 2224 */ 2225 void __init buffer_init(unsigned long mempages) 2226 { 2227 int order, i; 2228 unsigned int nr_hash; 2229 2230 /* The buffer cache hash table is less important these days, 2231 * trim it a bit. 2232 */ 2233 mempages >>= 14; 2234 2235 mempages *= sizeof(struct buffer_head *); 2236 2237 for (order = 0; (1 << order) < mempages; order++) 2238 ; 2239 2240 /* try to allocate something until we get it or we're asking 2241 for something that is really too small */ 2242 2243 do { 2244 unsigned long tmp; 2245 2246 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); 2247 bh_hash_mask = (nr_hash - 1); 2248 2249 tmp = nr_hash; 2250 bh_hash_shift = 0; 2251 while((tmp >>= 1UL) != 0UL) 2252 bh_hash_shift++; 2253 2254 hash_table = (struct buffer_head **) 2255 __get_free_pages(GFP_ATOMIC, order); 2256 } while (hash_table == NULL && --order > 0); 2257 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", 2258 nr_hash, order, (1UL<<order) * PAGE_SIZE); 2259 2260 if (!hash_table) 2261 panic("Failed to allocate buffer hash table\n"); 2262 2263 /* Setup hash chains. 
*/ 2264 for(i = 0; i < nr_hash; i++) 2265 hash_table[i] = NULL; 2266 2267 /* Setup free lists. */ 2268 for(i = 0; i < NR_SIZES; i++) { 2269 free_list[i].list = NULL; 2270 free_list[i].lock = SPIN_LOCK_UNLOCKED; 2271 } 2272 2273 /* Setup lru lists. */ 2274 for(i = 0; i < NR_LIST; i++) 2275 lru_list[i] = NULL; 2276 2277 bh_cachep = kmem_cache_create("buffer_head", 2278 sizeof(struct buffer_head), 2279 0, 2280 SLAB_HWCACHE_ALIGN, NULL, NULL); 2281 if(!bh_cachep) 2282 panic("Cannot create buffer head SLAB cache\n"); 2283 } 2284 2285 2286 /* ====================== bdflush support =================== */ 2287 2288 /* This is a simple kernel daemon, whose job it is to provide a dynamic 2289 * response to dirty buffers. Once this process is activated, we write back 2290 * a limited number of buffers to the disks and then go back to sleep again. 2291 */ 2292 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done); 2293 struct task_struct *bdflush_tsk = 0; 2294 2295 void wakeup_bdflush(int block) 2296 { 2297 DECLARE_WAITQUEUE(wait, current); 2298 2299 if (current == bdflush_tsk) 2300 return; 2301 2302 if (!block) { 2303 if(bdflush_tsk != NULL) wake_up_process(bdflush_tsk); 2304 return; 2305 } 2306 2307 /* kflushd can wakeup us before we have a chance to 2308 go to sleep so we must be smart in handling 2309 this wakeup event from kflushd to avoid deadlocking in SMP 2310 (we are not holding any lock anymore in these two paths). */ 2311 __set_current_state(TASK_UNINTERRUPTIBLE); 2312 add_wait_queue(&bdflush_done, &wait); 2313 2314 wake_up_process(bdflush_tsk); 2315 schedule(); 2316 2317 remove_wait_queue(&bdflush_done, &wait); 2318 __set_current_state(TASK_RUNNING); 2319 } 2320 2321 /* This is the _only_ function that deals with flushing async writes 2322 to disk. 2323 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list 2324 as all dirty buffers lives _only_ in the DIRTY lru list. 2325 As we never browse the LOCKED and CLEAN lru lists they are infact 2326 completly useless. */ 2327 static int flush_dirty_buffers(int check_flushtime) 2328 { 2329 struct buffer_head * bh, *next; 2330 int flushed = 0, i; 2331 2332 restart: 2333 spin_lock(&lru_list_lock); 2334 bh = lru_list[BUF_DIRTY]; 2335 if (!bh) 2336 goto out_unlock; 2337 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) { 2338 next = bh->b_next_free; 2339 2340 if (!buffer_dirty(bh)) { 2341 __refile_buffer(bh); 2342 continue; 2343 } 2344 if (buffer_locked(bh)) 2345 continue; 2346 2347 if (check_flushtime) { 2348 /* The dirty lru list is chronologically ordered so 2349 if the current bh is not yet timed out, 2350 then also all the following bhs 2351 will be too young. */ 2352 if (time_before(jiffies, bh->b_flushtime)) 2353 goto out_unlock; 2354 } else { 2355 if (++flushed > bdf_prm.b_un.ndirty) 2356 goto out_unlock; 2357 } 2358 2359 /* OK, now we are committed to write it out. */ 2360 atomic_inc(&bh->b_count); 2361 spin_unlock(&lru_list_lock); 2362 ll_rw_block(WRITE, 1, &bh); 2363 atomic_dec(&bh->b_count); 2364 2365 if (current->need_resched) 2366 schedule(); 2367 goto restart; 2368 } 2369 out_unlock: 2370 spin_unlock(&lru_list_lock); 2371 2372 return flushed; 2373 } 2374 2375 /* 2376 * Here we attempt to write back old buffers. We also try to flush inodes 2377 * and supers as well, since this function is essentially "update", and 2378 * otherwise there would be no way of ensuring that these quantities ever 2379 * get written back. 
Ideally, we would have a timestamp on the inodes 2380 * and superblocks so that we could write back only the old ones as well 2381 */ 2382 2383 static int sync_old_buffers(void) 2384 { 2385 lock_kernel(); 2386 sync_supers(0); 2387 sync_inodes(0); 2388 unlock_kernel(); 2389 2390 flush_dirty_buffers(1); 2391 /* must really sync all the active I/O request to disk here */ 2392 run_task_queue(&tq_disk); 2393 return 0; 2394 } 2395 2396 /* This is the interface to bdflush. As we get more sophisticated, we can 2397 * pass tuning parameters to this "process", to adjust how it behaves. 2398 * We would want to verify each parameter, however, to make sure that it 2399 * is reasonable. */ 2400 2401 asmlinkage long sys_bdflush(int func, long data) 2402 { 2403 if (!capable(CAP_SYS_ADMIN)) 2404 return -EPERM; 2405 2406 if (func == 1) { 2407 /* do_exit directly and let kupdate to do its work alone. */ 2408 do_exit(0); 2409 #if 0 /* left here as it's the only example of lazy-mm-stuff used from 2410 a syscall that doesn't care about the current mm context. */ 2411 int error; 2412 struct mm_struct *user_mm; 2413 2414 /* 2415 * bdflush will spend all of it's time in kernel-space, 2416 * without touching user-space, so we can switch it into 2417 * 'lazy TLB mode' to reduce the cost of context-switches 2418 * to and from bdflush. 2419 */ 2420 user_mm = start_lazy_tlb(); 2421 error = sync_old_buffers(); 2422 end_lazy_tlb(user_mm); 2423 return error; 2424 #endif 2425 } 2426 2427 /* Basically func 1 means read param 1, 2 means write param 1, etc */ 2428 if (func >= 2) { 2429 int i = (func-2) >> 1; 2430 if (i >= 0 && i < N_PARAM) { 2431 if ((func & 1) == 0) 2432 return put_user(bdf_prm.data[i], (int*)data); 2433 2434 if (data >= bdflush_min[i] && data <= bdflush_max[i]) { 2435 bdf_prm.data[i] = data; 2436 return 0; 2437 } 2438 } 2439 return -EINVAL; 2440 } 2441 2442 /* Having func 0 used to launch the actual bdflush and then never 2443 * return (unless explicitly killed). We return zero here to 2444 * remain semi-compatible with present update(8) programs. 2445 */ 2446 return 0; 2447 } 2448 2449 /* 2450 * This is the actual bdflush daemon itself. It used to be started from 2451 * the syscall above, but now we launch it ourselves internally with 2452 * kernel_thread(...) directly after the first thread in init/main.c 2453 */ 2454 int bdflush(void * unused) 2455 { 2456 int flushed; 2457 /* 2458 * We have a bare-bones task_struct, and really should fill 2459 * in a few more things so "top" and /proc/2/{exe,root,cwd} 2460 * display semi-sane things. Not real crucial though... 2461 */ 2462 2463 current->session = 1; 2464 current->pgrp = 1; 2465 sprintf(current->comm, "kflushd"); 2466 bdflush_tsk = current; 2467 2468 /* avoid getting signals */ 2469 spin_lock_irq(&current->sigmask_lock); 2470 flush_signals(current); 2471 sigfillset(&current->blocked); 2472 recalc_sigpending(current); 2473 spin_unlock_irq(&current->sigmask_lock); 2474 2475 for (;;) { 2476 CHECK_EMERGENCY_SYNC 2477 2478 flushed = flush_dirty_buffers(0); 2479 2480 /* If wakeup_bdflush will wakeup us 2481 after our bdflush_done wakeup, then 2482 we must make sure to not sleep 2483 in schedule_timeout otherwise 2484 wakeup_bdflush may wait for our 2485 bdflush_done wakeup that would never arrive 2486 (as we would be sleeping) and so it would 2487 deadlock in SMP. */ 2488 __set_current_state(TASK_INTERRUPTIBLE); 2489 wake_up(&bdflush_done); 2490 /* 2491 * If there are still a lot of dirty buffers around, 2492 * skip the sleep and flush some more. 
Otherwise, we 2493 * go to sleep waiting a wakeup. 2494 */ 2495 if (!flushed || balance_dirty_state(NODEV) < 0) 2496 schedule(); 2497 /* Remember to mark us as running otherwise 2498 the next schedule will block. */ 2499 __set_current_state(TASK_RUNNING); 2500 } 2501 } 2502 2503 /* 2504 * This is the kernel update daemon. It was used to live in userspace 2505 * but since it's need to run safely we want it unkillable by mistake. 2506 * You don't need to change your userspace configuration since 2507 * the userspace `update` will do_exit(0) at the first sys_bdflush(). 2508 */ 2509 int kupdate(void * unused) 2510 { 2511 struct task_struct * tsk = current; 2512 int interval; 2513 2514 tsk->session = 1; 2515 tsk->pgrp = 1; 2516 strcpy(tsk->comm, "kupdate"); 2517 2518 /* sigstop and sigcont will stop and wakeup kupdate */ 2519 spin_lock_irq(&tsk->sigmask_lock); 2520 sigfillset(&tsk->blocked); 2521 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); 2522 recalc_sigpending(tsk); 2523 spin_unlock_irq(&tsk->sigmask_lock); 2524 2525 for (;;) { 2526 /* update interval */ 2527 interval = bdf_prm.b_un.interval; 2528 if (interval) { 2529 tsk->state = TASK_INTERRUPTIBLE; 2530 schedule_timeout(interval); 2531 } else { 2532 stop_kupdate: 2533 tsk->state = TASK_STOPPED; 2534 schedule(); /* wait for SIGCONT */ 2535 } 2536 /* check for sigstop */ 2537 if (signal_pending(tsk)) { 2538 int stopped = 0; 2539 spin_lock_irq(&tsk->sigmask_lock); 2540 if (sigismember(&tsk->signal, SIGSTOP)) { 2541 sigdelset(&tsk->signal, SIGSTOP); 2542 stopped = 1; 2543 } 2544 recalc_sigpending(tsk); 2545 spin_unlock_irq(&tsk->sigmask_lock); 2546 if (stopped) 2547 goto stop_kupdate; 2548 } 2549 #ifdef DEBUG 2550 printk("kupdate() activated...\n"); 2551 #endif 2552 sync_old_buffers(); 2553 } 2554 } 2555 2556 static int __init bdflush_init(void) 2557 { 2558 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); 2559 kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); 2560 return 0; 2561 } 2562 2563 module_init(bdflush_init) 2564 2565
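
The brw_kiovec() loop near the top of this section accumulates buffer heads in bh[] and hands them to do_kio() whenever bhind reaches KIO_MAX_SECTORS, with one final submission for whatever is left when the iovec loops finish; at the finished: label it reports the amount already transferred if any, otherwise the error. Below is a minimal stand-alone sketch of that batch-and-flush shape only; DEMO_BATCH_MAX, process_all() and submit_batch() are hypothetical names, not kernel interfaces.

#define DEMO_BATCH_MAX 8

/*
 * Accumulate items into a fixed-size batch and submit it whenever it fills,
 * then submit the final partial batch -- the same shape as the bh[]/bhind
 * handling in brw_kiovec().  Partial progress wins over a late error,
 * matching the "finished:" return rule above.
 */
static long process_all(const int *items, int n,
			long (*submit_batch)(int *batch, int count))
{
	int batch[DEMO_BATCH_MAX];
	int fill = 0, i;
	long done = 0, err;

	for (i = 0; i < n; i++) {
		batch[fill++] = items[i];
		if (fill >= DEMO_BATCH_MAX) {
			err = submit_batch(batch, fill);
			if (err < 0)
				return done ? done : err;
			done += err;
			fill = 0;
		}
	}
	if (fill) {
		err = submit_batch(batch, fill);
		if (err < 0)
			return done ? done : err;
		done += err;
	}
	return done;
}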
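
brw_page() relies on the circular list that create_page_buffers() threads through b_this_page: it walks the ring exactly once, collects the buffer heads that still need I/O into arr[], and passes them to ll_rw_block() in a single call. The sketch below shows only that ring-walking pattern on a cut-down structure; struct demo_bh, collect_for_io() and the uptodate field are illustrative stand-ins, not the kernel's struct buffer_head.

/* Cut-down stand-in for struct buffer_head: just the ring link and a flag. */
struct demo_bh {
	struct demo_bh *b_this_page;	/* circular, singly linked ring */
	int uptodate;
};

/*
 * Walk the per-page ring exactly once and collect the members that still
 * need I/O -- the do { ... } while (bh != head) shape used by brw_page().
 */
static int collect_for_io(struct demo_bh *head, struct demo_bh **arr, int max)
{
	struct demo_bh *bh = head;
	int nr = 0;

	do {
		if (!bh->uptodate && nr < max)
			arr[nr++] = bh;
		bh = bh->b_this_page;
	} while (bh != head);

	return nr;	/* how many buffers would be handed to ll_rw_block() */
}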
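
grow_buffers() links each new buffer head into free_list[isize] as a circular, doubly linked list through b_next_free/b_prev_free, treating the empty list as a special case by making the first element point at itself. A minimal sketch of just that insertion step, assuming a simplified node type (struct demo_node and insert_after() are hypothetical):

struct demo_node {
	struct demo_node *next, *prev;	/* stand-ins for b_next_free/b_prev_free */
};

/*
 * Insert "node" after "point" in a circular doubly linked list, or turn it
 * into a one-element ring when the list is still empty -- the two cases
 * grow_buffers() handles for each buffer head on the page.
 */
static void insert_after(struct demo_node *node, struct demo_node *point)
{
	if (point) {
		node->next = point->next;
		node->prev = point;
		point->next->prev = node;
		point->next = node;
	} else {
		node->next = node;
		node->prev = node;
	}
}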
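
try_to_free_buffers() may only release a page when every buffer head on its ring is idle; the buffer_busy() macro folds that test into one expression by OR-ing the reference count with the Dirty, Lock and Protected state bits, so any non-zero result means the page must be kept. A stand-alone rendering of the same test with demo names (the DEMO_BH_* bit positions are illustrative, not the kernel's BH_* values):

#include <stdint.h>

#define DEMO_BH_DIRTY		0
#define DEMO_BH_LOCK		1
#define DEMO_BH_PROTECTED	2
#define DEMO_BUSY_BITS \
	((1u << DEMO_BH_DIRTY) | (1u << DEMO_BH_LOCK) | (1u << DEMO_BH_PROTECTED))

struct demo_state {
	int	 b_count;	/* stand-in for the atomic reference count */
	uint32_t b_state;	/* stand-in for the BH_* state bits */
};

/* Non-zero when the buffer is still referenced, dirty, locked or protected. */
static int demo_buffer_busy(const struct demo_state *bh)
{
	return bh->b_count | (int)(bh->b_state & DEMO_BUSY_BITS);
}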
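
buffer_init() derives the hash table geometry from the machine's page count: the count is scaled down by 2^14, multiplied by the size of a pointer, rounded up to the next power of two (which becomes the allocation order), and the resulting number of entries yields bh_hash_mask and bh_hash_shift. The sketch below redoes only that arithmetic for one example memory size; hash_sizing(), DEMO_PAGE_SIZE, DEMO_PTR_SIZE and the 128 MB figure are illustrative assumptions, and the __get_free_pages() retry loop is left out.

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
#define DEMO_PTR_SIZE	sizeof(void *)

/* Redo the sizing arithmetic of buffer_init() for a given number of pages. */
static void hash_sizing(unsigned long mempages)
{
	unsigned long bytes, tmp;
	unsigned int order, nr_hash, mask, shift = 0;

	mempages >>= 14;			/* "trim it a bit" */
	bytes = mempages * DEMO_PTR_SIZE;

	for (order = 0; (1UL << order) < bytes; order++)
		;				/* becomes the allocation order */

	nr_hash = (DEMO_PAGE_SIZE << order) / DEMO_PTR_SIZE;
	mask = nr_hash - 1;			/* nr_hash is a power of two */

	tmp = nr_hash;
	while ((tmp >>= 1UL) != 0UL)
		shift++;			/* shift = log2(nr_hash) */

	printf("entries=%u mask=%#x shift=%u (order %u)\n",
	       nr_hash, mask, shift, order);
}

int main(void)
{
	hash_sizing(32768UL);	/* e.g. 128 MB worth of 4 KB pages */
	return 0;
}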
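
flush_dirty_buffers() depends on the BUF_DIRTY lru list being chronologically ordered: in the kupdate case it stops at the first buffer whose b_flushtime has not yet expired, since every later buffer is younger still, while in the bdflush case it stops after bdf_prm.b_un.ndirty writes. The sketch below shows just that stop condition over a plain array; demo_time_before(), struct demo_dirty and count_flushable() are hypothetical stand-ins, with the wraparound-safe comparison written out the way the kernel's time_before() does it.

/* Wraparound-safe "a is before b", as the kernel's time_before() computes it. */
#define demo_time_before(a, b)	((long)(a) - (long)(b) < 0)

struct demo_dirty {
	unsigned long flushtime;	/* when this entry becomes old enough */
};

/*
 * Walk a chronologically ordered dirty list and report how many entries
 * would be written back: everything already due (check_flushtime != 0),
 * or at most "budget" entries (the bdf_prm.b_un.ndirty case).
 */
static int count_flushable(const struct demo_dirty *d, int n, unsigned long now,
			   int check_flushtime, int budget)
{
	int i, flushed = 0;

	for (i = 0; i < n; i++) {
		if (check_flushtime) {
			/* Later entries are younger still, so stop at the
			 * first one that has not timed out yet. */
			if (demo_time_before(now, d[i].flushtime))
				break;
		} else if (flushed >= budget)
			break;
		flushed++;
	}
	return flushed;
}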
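
sys_bdflush() multiplexes parameter access through its func argument: for func >= 2 the parameter index is (func - 2) >> 1, an even func reads bdf_prm.data[i] back through put_user(), and an odd func stores a new value after checking it against bdflush_min[] and bdflush_max[]. The helpers below simply spell out that encoding as arithmetic; bdflush_func_read(), bdflush_func_write() and bdflush_param_index() are hypothetical names, not kernel or libc interfaces.

/* func value that reads bdflush parameter i (copied out through "data"). */
static int bdflush_func_read(int i)
{
	return 2 + 2 * i;	/* even, so (func & 1) == 0 in sys_bdflush() */
}

/* func value that writes bdflush parameter i (new value passed in "data"). */
static int bdflush_func_write(int i)
{
	return 3 + 2 * i;	/* odd */
}

/* The kernel's decode step: which parameter a given func refers to. */
static int bdflush_param_index(int func)
{
	return (func - 2) >> 1;
}

So func 2 and 3 address parameter 0, func 4 and 5 parameter 1, and so on, up to N_PARAM parameters.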
