/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required on older ARM systems. */

/* Thread it... -DaveM */

/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */

/* Anti-deadlock ordering:
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 */
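/*
 * Worked example (illustrative, assuming a 4096-byte PAGE_CACHE_SIZE):
 * BUFSIZE_INDEX(1024) == buffersize_index[2] == 1 and
 * BUFSIZE_INDEX(4096) == buffersize_index[8] == 3, i.e. the per-size
 * arrays below are indexed by log2(blocksize) - 9.  With 4096-byte pages,
 * MAX_BUF_PER_PAGE is 8 (eight 512-byte buffers per page) and NR_RESERVED
 * is therefore 16.
 */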
static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST];
static unsigned long size_buffers_type[NR_LIST];

static struct buffer_head * unused_list;
static int nr_unused_buffer_heads;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
static struct bh_free_head free_list[NR_SIZES];

static int grow_buffers(int size);
static void __refile_buffer(struct buffer_head *);
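/*
 * Rough picture of the structures above (illustrative summary): every
 * buffer_head in the cache is hashed by (b_dev, b_blocknr) into hash_table
 * for getblk()/get_hash_table() lookups, and it also sits on exactly one of
 * the lru_list[] chains according to b_list (BUF_CLEAN, BUF_LOCKED,
 * BUF_DIRTY, BUF_PROTECTED); nr_buffers_type[] and size_buffers_type[]
 * mirror those chains.  Buffer heads with no data attached wait on
 * unused_list, while whole unused buffers hang off
 * free_list[BUFSIZE_INDEX(size)].  The locks must always be taken in the
 * anti-deadlock order documented above.
 */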
/* This is used by some architectures to estimate available memory. */
atomic_t buffermem_pages = ATOMIC_INIT(0);

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int interval;	/* jiffies delay between kupdate flushes */
		int age_buffer;	/* Time for normal buffer to age before we flush it */
		int dummy1;	/* unused, was age_super */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};

/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,    10,     5,    25,      0,    1*HZ,    1*HZ,    1, 1};
int bdflush_max[N_PARAM] = {100, 50000, 20000, 20000, 600*HZ, 6000*HZ, 6000*HZ, 2047, 5};

/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and got rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
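/*
 * Illustrative reading of the defaults above: nfract=40 means writeback
 * pressure starts once roughly 40% of the (non-protected) buffer-cache
 * pages are dirty (see balance_dirty_state() below), ndirty=500 caps how
 * many dirty blocks bdflush writes per wake-cycle, interval=5*HZ makes
 * kupdate run about every 5 seconds (assuming HZ=100), and age_buffer=30*HZ
 * lets a freshly dirtied buffer sit for about 30 seconds before kupdate
 * considers it old enough to flush.
 */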
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	atomic_inc(&bh->b_count);
	add_wait_queue(&bh->b_wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!buffer_locked(bh))
			break;
		schedule();
	} while (buffer_locked(bh));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	atomic_dec(&bh->b_count);
}

/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev
,int wait
) 173 int i
, retry
, pass
=0, err
=0; 174 struct buffer_head
* bh
, *next
; 176 /* One pass for no-wait, three for wait: 177 * 0) write out all dirty, unlocked buffers; 178 * 1) write out all dirty buffers, waiting if locked; 179 * 2) wait for completion by waiting for all buffers to unlock. 184 /* We search all lists as a failsafe mechanism, not because we expect 185 * there to be dirty buffers on any of the other lists. 188 spin_lock(&lru_list_lock
); 189 bh
= lru_list
[BUF_DIRTY
]; 193 for(i
= nr_buffers_type
[BUF_DIRTY
]*2; i
-- >0; bh
= next
) { 194 next
= bh
->b_next_free
; 196 if(!lru_list
[BUF_DIRTY
]) 198 if(dev
&& bh
->b_dev
!= dev
) 200 if(buffer_locked(bh
)) { 201 /* Buffer is locked; skip it unless wait is 202 * requested AND pass > 0. 208 atomic_inc(&bh
->b_count
); 209 spin_unlock(&lru_list_lock
); 211 atomic_dec(&bh
->b_count
); 215 /* If an unlocked buffer is not uptodate, there has 216 * been an IO error. Skip it. 218 if(wait
&&buffer_req(bh
) && !buffer_locked(bh
) && 219 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) { 224 /* Don't write clean buffers. Don't write ANY buffers 227 if(!buffer_dirty(bh
) || pass
>=2) 230 atomic_inc(&bh
->b_count
); 231 spin_unlock(&lru_list_lock
); 232 ll_rw_block(WRITE
,1, &bh
); 233 atomic_dec(&bh
->b_count
); 239 bh
= lru_list
[BUF_LOCKED
]; 241 spin_unlock(&lru_list_lock
); 244 for(i
= nr_buffers_type
[BUF_LOCKED
]*2; i
-- >0; bh
= next
) { 245 next
= bh
->b_next_free
; 247 if(!lru_list
[BUF_LOCKED
]) 249 if(dev
&& bh
->b_dev
!= dev
) 251 if(buffer_locked(bh
)) { 252 /* Buffer is locked; skip it unless wait is 253 * requested AND pass > 0. 259 atomic_inc(&bh
->b_count
); 260 spin_unlock(&lru_list_lock
); 262 spin_lock(&lru_list_lock
); 263 atomic_dec(&bh
->b_count
); 267 spin_unlock(&lru_list_lock
); 269 /* If we are waiting for the sync to succeed, and if any dirty 270 * blocks were written, then repeat; on the second pass, only 271 * wait for buffers being written (do not pass to write any 272 * more buffers on the second pass). 274 }while(wait
&& retry
&& ++pass
<=2); 278 voidsync_dev(kdev_t dev
)

	/* sync all the dirty buffers out to disk only _after_ all the
	   high level layers have finished generating dirty buffer data
	   (or we'll return with some buffers still dirty on the block device,
	   thus breaking the semantics of this call) */

	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */

int fsync_dev(kdev_t dev
) 311 returnsync_buffers(dev
,1); 314 asmlinkage
longsys_sync(void) 321 * filp may be NULL if called via the msync of a vma. 324 intfile_fsync(struct file
*filp
,struct dentry
*dentry
,int datasync
) 326 struct inode
* inode
= dentry
->d_inode
; 327 struct super_block
* sb
; 332 /* sync the inode to buffers */ 333 write_inode_now(inode
,0); 335 /* sync the superblock to buffers */ 338 if(sb
->s_op
&& sb
->s_op
->write_super
) 339 sb
->s_op
->write_super(sb
); 341 /* .. finally sync the buffers to disk */ 343 ret
=sync_buffers(dev
,1); 348 asmlinkage
longsys_fsync(unsigned int fd
) 351 struct dentry
* dentry
; 352 struct inode
* inode
; 360 dentry
= file
->f_dentry
; 361 inode
= dentry
->d_inode
; 364 if(!file
->f_op
|| !file
->f_op
->fsync
) 367 /* We need to protect against concurrent writers.. */ 369 err
= file
->f_op
->fsync(file
, dentry
,0); 378 asmlinkage
longsys_fdatasync(unsigned int fd
) 381 struct dentry
* dentry
; 382 struct inode
* inode
; 390 dentry
= file
->f_dentry
; 391 inode
= dentry
->d_inode
; 394 if(!file
->f_op
|| !file
->f_op
->fsync
) 398 err
= file
->f_op
->fsync(file
, dentry
,1);

/* After several hours of tedious analysis, the following hash
 * function won. Do not mess with it... -DaveM
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
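/*
 * Illustrative note: _hashfn() folds the device number and the block number
 * together by XOR-ing several shifted copies of each, so that neighbouring
 * blocks of the same device tend to land on different hash chains; hash()
 * then masks the result with bh_hash_mask (the table size minus one, always
 * a power of two - see buffer_init() below) to pick a chain head in
 * hash_table[].
 */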
static __inline__ void __hash_link(struct buffer_head
*bh
,struct buffer_head
**head
) 417 if((bh
->b_next
= *head
) != NULL
) 418 bh
->b_next
->b_pprev
= &bh
->b_next
; 423 static __inline__
void__hash_unlink(struct buffer_head
*bh
) 427 bh
->b_next
->b_pprev
= bh
->b_pprev
; 428 *(bh
->b_pprev
) = bh
->b_next
; 433 static void__insert_into_lru_list(struct buffer_head
* bh
,int blist
) 435 struct buffer_head
**bhp
= &lru_list
[blist
]; 439 bh
->b_prev_free
= bh
; 441 bh
->b_next_free
= *bhp
; 442 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 443 (*bhp
)->b_prev_free
->b_next_free
= bh
; 444 (*bhp
)->b_prev_free
= bh
; 445 nr_buffers_type
[blist
]++; 446 size_buffers_type
[blist
] += bh
->b_size
; 449 static void__remove_from_lru_list(struct buffer_head
* bh
,int blist
) 451 if(bh
->b_prev_free
|| bh
->b_next_free
) { 452 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 453 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 454 if(lru_list
[blist
] == bh
) 455 lru_list
[blist
] = bh
->b_next_free
; 456 if(lru_list
[blist
] == bh
) 457 lru_list
[blist
] = NULL
; 458 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 459 nr_buffers_type
[blist
]--; 460 size_buffers_type
[blist
] -= bh
->b_size
; 464 static void__remove_from_free_list(struct buffer_head
* bh
,int index
) 466 if(bh
->b_next_free
== bh
) 467 free_list
[index
].list
= NULL
; 469 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 470 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 471 if(free_list
[index
].list
== bh
) 472 free_list
[index
].list
= bh
->b_next_free
; 474 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 477 /* must be called with both the hash_table_lock and the lru_list_lock 479 static void__remove_from_queues(struct buffer_head
*bh
) 482 __remove_from_lru_list(bh
, bh
->b_list
); 485 static void__insert_into_queues(struct buffer_head
*bh
) 487 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
); 489 __hash_link(bh
, head
); 490 __insert_into_lru_list(bh
, bh
->b_list
); 493 /* This function must only run if there are no other 494 * references _anywhere_ to this buffer head. 496 static voidput_last_free(struct buffer_head
* bh
) 498 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)]; 499 struct buffer_head
**bhp
= &head
->list
; 503 spin_lock(&head
->lock
); 507 bh
->b_prev_free
= bh
; 509 bh
->b_next_free
= *bhp
; 510 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 511 (*bhp
)->b_prev_free
->b_next_free
= bh
; 512 (*bhp
)->b_prev_free
= bh
; 513 spin_unlock(&head
->lock
); 517 * Why like this, I hear you say... The reason is race-conditions. 518 * As we don't lock buffers (unless we are reading them, that is), 519 * something might happen to it while we sleep (ie a read-error 520 * will force it bad). This shouldn't really happen currently, but 523 staticinlinestruct buffer_head
*__get_hash_table(kdev_t dev
,int block
,int size
) 525 struct buffer_head
*bh
=hash(dev
, block
); 527 for(; bh
; bh
= bh
->b_next
) 528 if(bh
->b_blocknr
== block
&& 529 bh
->b_size
== size
&& 533 atomic_inc(&bh
->b_count
); 538 struct buffer_head
*get_hash_table(kdev_t dev
,int block
,int size
) 540 struct buffer_head
*bh
; 542 read_lock(&hash_table_lock
); 543 bh
=__get_hash_table(dev
, block
, size
); 544 read_unlock(&hash_table_lock
); 549 unsigned intget_hardblocksize(kdev_t dev
) 552 * Get the hard sector size for the given device. If we don't know 553 * what it is, return 0. 555 if(hardsect_size
[MAJOR(dev
)] != NULL
) { 556 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash dirty
   buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

   NOTE: In the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to an error of the user), by not destroying the dirty buffers
   we could generate corruption also on the next media inserted, thus a
   parameter is necessary to handle this case in the safest way possible
   (trying not to corrupt the newly inserted disk as well with the data
   belonging to the old, now corrupted, disk). Also for the ramdisk the
   natural thing to do in order to release the ramdisk memory is to destroy
   dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and then
   an invalidate_buffers call that doesn't trash dirty buffers. */
void __invalidate_buffers(kdev_t dev
,int destroy_dirty_buffers
) 591 struct buffer_head
* bh
, * bh_next
; 595 spin_lock(&lru_list_lock
); 596 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 597 bh
= lru_list
[nlist
]; 600 for(i
= nr_buffers_type
[nlist
]; i
>0; bh
= bh_next
, i
--) { 601 bh_next
= bh
->b_next_free
; 604 if(buffer_locked(bh
)) { 605 atomic_inc(&bh
->b_count
); 606 spin_unlock(&lru_list_lock
); 609 spin_lock(&lru_list_lock
); 610 atomic_dec(&bh
->b_count
); 613 write_lock(&hash_table_lock
); 614 if(!atomic_read(&bh
->b_count
) && 615 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) { 616 __remove_from_queues(bh
); 619 write_unlock(&hash_table_lock
); 625 spin_unlock(&lru_list_lock
); 630 voidset_blocksize(kdev_t dev
,int size
) 632 externint*blksize_size
[]; 634 struct buffer_head
* bh
, * bh_next
; 636 if(!blksize_size
[MAJOR(dev
)]) 639 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 640 if(size
> PAGE_SIZE
|| size
<512|| (size
& (size
-1))) 641 panic("Invalid blocksize passed to set_blocksize"); 643 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] ==0&& size
== BLOCK_SIZE
) { 644 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 647 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
) 650 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 654 spin_lock(&lru_list_lock
); 655 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 656 bh
= lru_list
[nlist
]; 659 for(i
= nr_buffers_type
[nlist
]; i
>0; bh
= bh_next
, i
--) { 660 bh_next
= bh
->b_next_free
; 661 if(bh
->b_dev
!= dev
|| bh
->b_size
== size
) 663 if(buffer_locked(bh
)) { 664 atomic_inc(&bh
->b_count
); 665 spin_unlock(&lru_list_lock
); 668 spin_lock(&lru_list_lock
); 669 atomic_dec(&bh
->b_count
); 672 write_lock(&hash_table_lock
); 673 if(!atomic_read(&bh
->b_count
)) { 676 "set_blocksize: dev %s buffer_dirty %lu size %hu\n", 677 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
); 678 __remove_from_queues(bh
); 681 if(atomic_set_buffer_clean(bh
)) 683 clear_bit(BH_Uptodate
, &bh
->b_state
); 686 "b_count %d, dev %s, block %lu, from %p\n", 687 atomic_read(&bh
->b_count
),bdevname(bh
->b_dev
), 688 bh
->b_blocknr
,__builtin_return_address(0)); 690 write_unlock(&hash_table_lock
); 696 spin_unlock(&lru_list_lock
); 702 * We used to try various strange things. Let's not. 704 static voidrefill_freelist(int size
) 706 if(!grow_buffers(size
)) { 708 current
->policy
|= SCHED_YIELD
; 713 voidinit_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
,void*private) 715 bh
->b_list
= BUF_CLEAN
; 716 bh
->b_end_io
= handler
; 717 bh
->b_private
=private; 720 static voidend_buffer_io_sync(struct buffer_head
*bh
,int uptodate
) 722 mark_buffer_uptodate(bh
, uptodate
); 726 static voidend_buffer_io_bad(struct buffer_head
*bh
,int uptodate
) 728 mark_buffer_uptodate(bh
, uptodate
); 733 static voidend_buffer_io_async(struct buffer_head
* bh
,int uptodate
) 735 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
; 737 struct buffer_head
*tmp
; 740 mark_buffer_uptodate(bh
, uptodate
); 742 /* This is a temporary buffer used for page I/O. */ 749 * Be _very_ careful from here on. Bad things can happen if 750 * two buffer heads end IO at almost the same time and both 751 * decide that the page is now completely done. 753 * Async buffer_heads are here only as labels for IO, and get 754 * thrown away once the IO for this page is complete. IO is 755 * deemed complete once all buffers have been visited 756 * (b_count==0) and are now unlocked. We must make sure that 757 * only the _last_ buffer that decrements its count is the one 758 * that unlock the page.. 760 spin_lock_irqsave(&page_uptodate_lock
, flags
); 762 atomic_dec(&bh
->b_count
); 763 tmp
= bh
->b_this_page
; 765 if(tmp
->b_end_io
== end_buffer_io_async
&&buffer_locked(tmp
)) 767 tmp
= tmp
->b_this_page
; 770 /* OK, the async IO on this page is complete. */ 771 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 774 * if none of the buffers had errors then we can set the 778 SetPageUptodate(page
); 781 * Run the hooks that have to be done when a page I/O has completed. 783 if(PageTestandClearDecrAfter(page
)) 784 atomic_dec(&nr_async_pages
); 791 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 796 * Ok, this is getblk, and it isn't very clear, again to hinder 797 * race-conditions. Most of the code is seldom used, (ie repeating), 798 * so it should be much more efficient than it looks. 800 * The algorithm is changed: hopefully better, and an elusive bug removed. 802 * 14.02.92: changed it to sync dirty buffers a bit: better performance 803 * when the filesystem starts to get full of dirty blocks (I hope). 805 struct buffer_head
*getblk(kdev_t dev
,int block
,int size
) 807 struct buffer_head
* bh
; 811 spin_lock(&lru_list_lock
); 812 write_lock(&hash_table_lock
); 813 bh
=__get_hash_table(dev
, block
, size
); 817 isize
=BUFSIZE_INDEX(size
); 818 spin_lock(&free_list
[isize
].lock
); 819 bh
= free_list
[isize
].list
; 821 __remove_from_free_list(bh
, isize
); 822 atomic_set(&bh
->b_count
,1); 824 spin_unlock(&free_list
[isize
].lock
); 827 * OK, FINALLY we know that this buffer is the only one of 828 * its kind, we hold a reference (b_count>0), it is unlocked, 832 init_buffer(bh
, end_buffer_io_sync
, NULL
); 834 bh
->b_blocknr
= block
; 835 bh
->b_state
=1<< BH_Mapped
; 837 /* Insert the buffer into the regular lists */ 838 __insert_into_queues(bh
); 840 write_unlock(&hash_table_lock
); 841 spin_unlock(&lru_list_lock
); 847 * If we block while refilling the free list, somebody may 848 * create the buffer first ... search the hashes again. 850 write_unlock(&hash_table_lock
); 851 spin_unlock(&lru_list_lock
); 852 refill_freelist(size
);

/* -1 -> no need to flush
    0 -> async flush
    1 -> sync flush (wait for I/O completion) */
static int balance_dirty_state(kdev_t dev
) 861 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
; 863 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
; 864 tot
=nr_free_buffer_pages(); 865 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
; 868 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
; 869 hard_dirty_limit
= soft_dirty_limit
*2; 871 if(dirty
> soft_dirty_limit
) { 872 if(dirty
> hard_dirty_limit
) 880 * if a new dirty buffer is created we need to balance bdflush. 882 * in the future we might want to make bdflush aware of different 883 * pressures on different devices - thus the (currently unused) 886 voidbalance_dirty(kdev_t dev
) 888 int state
=balance_dirty_state(dev
); 892 wakeup_bdflush(state
); 895 static __inline__
void__mark_dirty(struct buffer_head
*bh
,int flag
) 897 bh
->b_flushtime
= jiffies
+ bdf_prm
.b_un
.age_buffer
; 901 /* atomic version, the user must call balance_dirty() by hand 902 as soon as it become possible to block */ 903 void__mark_buffer_dirty(struct buffer_head
*bh
,int flag
) 905 if(!atomic_set_buffer_dirty(bh
)) 906 __mark_dirty(bh
, flag
); 909 voidmark_buffer_dirty(struct buffer_head
*bh
,int flag
) 911 __mark_buffer_dirty(bh
, flag
); 912 balance_dirty(bh
->b_dev
); 916 * A buffer may need to be moved from one buffer list to another 917 * (e.g. in case it is not shared any more). Handle this. 919 static void__refile_buffer(struct buffer_head
*bh
) 921 int dispose
= BUF_CLEAN
; 922 if(buffer_locked(bh
)) 923 dispose
= BUF_LOCKED
; 926 if(buffer_protected(bh
)) 927 dispose
= BUF_PROTECTED
; 928 if(dispose
!= bh
->b_list
) { 929 __remove_from_lru_list(bh
, bh
->b_list
); 930 bh
->b_list
= dispose
; 931 __insert_into_lru_list(bh
, dispose
); 935 voidrefile_buffer(struct buffer_head
*bh
) 937 spin_lock(&lru_list_lock
); 939 spin_unlock(&lru_list_lock
); 943 * Release a buffer head 945 void__brelse(struct buffer_head
* buf
) 947 if(atomic_read(&buf
->b_count
)) { 948 atomic_dec(&buf
->b_count
); 951 printk("VFS: brelse: Trying to free free buffer\n"); 955 * bforget() is like brelse(), except it puts the buffer on the 956 * free list if it can.. We can NOT free the buffer if: 957 * - there are other users of it 958 * - it is locked and thus can have active IO 960 void__bforget(struct buffer_head
* buf
) 962 /* grab the lru lock here to block bdflush. */ 963 spin_lock(&lru_list_lock
); 964 write_lock(&hash_table_lock
); 965 if(!atomic_dec_and_test(&buf
->b_count
) ||buffer_locked(buf
)) 968 write_unlock(&hash_table_lock
); 969 __remove_from_lru_list(buf
, buf
->b_list
); 970 spin_unlock(&lru_list_lock
); 975 write_unlock(&hash_table_lock
); 976 spin_unlock(&lru_list_lock
); 980 * bread() reads a specified block and returns the buffer that contains 981 * it. It returns NULL if the block was unreadable. 983 struct buffer_head
*bread(kdev_t dev
,int block
,int size
) 985 struct buffer_head
* bh
; 987 bh
=getblk(dev
, block
, size
); 988 if(buffer_uptodate(bh
)) 990 ll_rw_block(READ
,1, &bh
); 992 if(buffer_uptodate(bh
)) 999 * Ok, breada can be used as bread, but additionally to mark other 1000 * blocks for reading as well. End the argument list with a negative 1006 struct buffer_head
*breada(kdev_t dev
,int block
,int bufsize
, 1007 unsigned int pos
,unsigned int filesize
) 1009 struct buffer_head
* bhlist
[NBUF
]; 1010 unsigned int blocks
; 1011 struct buffer_head
* bh
; 1021 bh
=getblk(dev
, block
, bufsize
); 1022 index
=BUFSIZE_INDEX(bh
->b_size
); 1024 if(buffer_uptodate(bh
)) 1026 elsell_rw_block(READ
,1, &bh
); 1028 blocks
= (filesize
- pos
) >> (9+index
); 1030 if(blocks
< (read_ahead
[MAJOR(dev
)] >> index
)) 1031 blocks
= read_ahead
[MAJOR(dev
)] >> index
; 1035 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ 1039 for(i
=1; i
<blocks
; i
++) { 1040 bh
=getblk(dev
,block
+i
,bufsize
); 1041 if(buffer_uptodate(bh
)) { 1045 else bhlist
[j
++] = bh
; 1048 /* Request the read for these buffers, and then release them. */ 1050 ll_rw_block(READA
, (j
-1), bhlist
+1); 1054 /* Wait for this buffer, and then continue on. */ 1057 if(buffer_uptodate(bh
)) 1064 * Note: the caller should wake up the buffer_wait list if needed. 1066 static __inline__
void__put_unused_buffer_head(struct buffer_head
* bh
) 1068 if(nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) { 1069 kmem_cache_free(bh_cachep
, bh
); 1072 init_waitqueue_head(&bh
->b_wait
); 1073 nr_unused_buffer_heads
++; 1074 bh
->b_next_free
= unused_list
; 1075 bh
->b_this_page
= NULL
; 1081 * Reserve NR_RESERVED buffer heads for async IO requests to avoid 1082 * no-buffer-head deadlock. Return NULL on failure; waiting for 1083 * buffer heads is now handled in create_buffers(). 1085 static struct buffer_head
*get_unused_buffer_head(int async
) 1087 struct buffer_head
* bh
; 1089 spin_lock(&unused_list_lock
); 1090 if(nr_unused_buffer_heads
> NR_RESERVED
) { 1092 unused_list
= bh
->b_next_free
; 1093 nr_unused_buffer_heads
--; 1094 spin_unlock(&unused_list_lock
); 1097 spin_unlock(&unused_list_lock
); 1099 /* This is critical. We can't swap out pages to get 1100 * more buffer heads, because the swap-out may need 1101 * more buffer-heads itself. Thus SLAB_BUFFER. 1103 if((bh
=kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) { 1104 memset(bh
,0,sizeof(*bh
)); 1105 init_waitqueue_head(&bh
->b_wait
); 1110 * If we need an async buffer, use the reserved buffer heads. 1113 spin_lock(&unused_list_lock
); 1116 unused_list
= bh
->b_next_free
; 1117 nr_unused_buffer_heads
--; 1118 spin_unlock(&unused_list_lock
); 1121 spin_unlock(&unused_list_lock
); 1125 * (Pending further analysis ...) 1126 * Ordinary (non-async) requests can use a different memory priority 1127 * to free up pages. Any swapping thus generated will use async 1131 (bh
=kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) { 1132 memset(bh
,0,sizeof(*bh
)); 1133 init_waitqueue_head(&bh
->b_wait
); 1141 voidset_bh_page(struct buffer_head
*bh
,struct page
*page
,unsigned long offset
) 1144 if(offset
>= PAGE_SIZE
) 1146 if(PageHighMem(page
)) 1148 * This catches illegal uses and preserves the offset: 1150 bh
->b_data
= (char*)(0+ offset
); 1152 bh
->b_data
=page_address(page
) + offset
; 1156 * Create the appropriate buffers when given a page for data area and 1157 * the size of each buffer.. Use the bh->b_this_page linked list to 1158 * follow the buffers created. Return NULL if unable to create more 1160 * The async flag is used to differentiate async IO (paging, swapping) 1161 * from ordinary buffer allocations, and only async requests are allowed 1162 * to sleep waiting for buffer heads. 1164 static struct buffer_head
*create_buffers(struct page
* page
,unsigned long size
,int async
) 1166 struct buffer_head
*bh
, *head
; 1172 while((offset
-= size
) >=0) { 1173 bh
=get_unused_buffer_head(async
); 1177 bh
->b_dev
= B_FREE
;/* Flag as unused */ 1178 bh
->b_this_page
= head
; 1182 bh
->b_next_free
= NULL
; 1184 atomic_set(&bh
->b_count
,0); 1187 set_bh_page(bh
, page
, offset
); 1189 bh
->b_list
= BUF_CLEAN
; 1190 bh
->b_end_io
= end_buffer_io_bad
; 1194 * In case anything failed, we just free everything we got. 1198 spin_lock(&unused_list_lock
); 1201 head
= head
->b_this_page
; 1202 __put_unused_buffer_head(bh
); 1204 spin_unlock(&unused_list_lock
); 1206 /* Wake up any waiters ... */ 1207 wake_up(&buffer_wait
); 1211 * Return failure for non-async IO requests. Async IO requests 1212 * are not allowed to fail, so we have to wait until buffer heads 1213 * become available. But we don't want tasks sleeping with 1214 * partially complete buffers, so all were released above. 1219 /* We're _really_ low on memory. Now we just 1220 * wait for old buffer heads to become free due to 1221 * finishing IO. Since this is an async request and 1222 * the reserve list is empty, we're sure there are 1223 * async buffer heads in use. 1225 run_task_queue(&tq_disk
); 1228 * Set our state for sleeping, then check again for buffer heads. 1229 * This ensures we won't miss a wake_up from an interrupt. 1231 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
); 1235 static intcreate_page_buffers(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
) 1237 struct buffer_head
*head
, *bh
, *tail
; 1240 if(!PageLocked(page
)) 1243 * Allocate async buffer heads pointing to this page, just for I/O. 1244 * They don't show up in the buffer hash table, but they *are* 1245 * registered in page->buffers. 1247 head
=create_buffers(page
, size
,1); 1253 for(bh
= head
; bh
; bh
= bh
->b_this_page
) { 1257 init_buffer(bh
, end_buffer_io_async
, NULL
); 1259 bh
->b_blocknr
= block
; 1261 set_bit(BH_Mapped
, &bh
->b_state
); 1263 tail
->b_this_page
= head
; 1264 page_cache_get(page
); 1265 page
->buffers
= head
; 1269 static voidunmap_buffer(struct buffer_head
* bh
) 1271 if(buffer_mapped(bh
)) { 1272 mark_buffer_clean(bh
); 1274 clear_bit(BH_Uptodate
, &bh
->b_state
); 1275 clear_bit(BH_Mapped
, &bh
->b_state
); 1276 clear_bit(BH_Req
, &bh
->b_state
); 1277 clear_bit(BH_New
, &bh
->b_state
); 1282 * We don't have to release all buffers here, but 1283 * we have to be sure that no dirty buffer is left 1284 * and no IO is going on (no buffer is locked), because 1285 * we have truncated the file and are going to free the 1288 intblock_flushpage(struct page
*page
,unsigned long offset
) 1290 struct buffer_head
*head
, *bh
, *next
; 1291 unsigned int curr_off
=0; 1293 if(!PageLocked(page
)) 1298 head
= page
->buffers
; 1301 unsigned int next_off
= curr_off
+ bh
->b_size
; 1302 next
= bh
->b_this_page
; 1305 * is this block fully flushed? 1307 if(offset
<= curr_off
) 1309 curr_off
= next_off
; 1314 * subtle. We release buffer-heads only if this is 1315 * the 'final' flushpage. We have invalidated the get_block 1316 * cached value unconditionally, so real IO is not 1319 * If the free doesn't work out, the buffers can be 1320 * left around - they just turn into anonymous buffers 1324 if(!try_to_free_buffers(page
,0)) { 1325 atomic_inc(&buffermem_pages
); 1333 static voidcreate_empty_buffers(struct page
*page
,struct inode
*inode
,unsigned long blocksize
) 1335 struct buffer_head
*bh
, *head
, *tail
; 1337 head
=create_buffers(page
, blocksize
,1); 1343 bh
->b_dev
= inode
->i_dev
; 1345 bh
->b_end_io
= end_buffer_io_bad
; 1347 bh
= bh
->b_this_page
; 1349 tail
->b_this_page
= head
; 1350 page
->buffers
= head
; 1351 page_cache_get(page
);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 */

static void unmap_underlying_metadata(struct buffer_head
* bh
) 1368 struct buffer_head
*old_bh
; 1370 old_bh
=get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
); 1372 mark_buffer_clean(old_bh
); 1373 wait_on_buffer(old_bh
); 1374 clear_bit(BH_Req
, &old_bh
->b_state
); 1375 /* Here we could run brelse or bforget. We use 1376 bforget because it will try to put the buffer 1383 * block_write_full_page() is SMP-safe - currently it's still 1384 * being called with the kernel lock held, but the code is ready. 1386 static int__block_write_full_page(struct inode
*inode
,struct page
*page
, get_block_t
*get_block
) 1388 int err
, i
, need_balance_dirty
=0; 1389 unsigned long block
; 1390 struct buffer_head
*bh
, *head
; 1392 if(!PageLocked(page
)) 1396 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
); 1397 head
= page
->buffers
; 1399 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
); 1405 * If the buffer isn't up-to-date, we can't be sure 1406 * that the buffer has been initialized with the proper 1407 * block number information etc.. 1409 * Leave it to the low-level FS to make all those 1410 * decisions (block #0 may actually be a valid block) 1412 bh
->b_end_io
= end_buffer_io_sync
; 1413 if(!buffer_mapped(bh
)) { 1414 err
=get_block(inode
, block
, bh
,1); 1418 unmap_underlying_metadata(bh
); 1420 set_bit(BH_Uptodate
, &bh
->b_state
); 1421 if(!atomic_set_buffer_dirty(bh
)) { 1423 need_balance_dirty
=1; 1426 bh
= bh
->b_this_page
; 1430 if(need_balance_dirty
) 1431 balance_dirty(bh
->b_dev
); 1433 SetPageUptodate(page
); 1436 ClearPageUptodate(page
); 1440 static int__block_prepare_write(struct inode
*inode
,struct page
*page
, 1441 unsigned from
,unsigned to
, get_block_t
*get_block
) 1443 unsigned block_start
, block_end
; 1444 unsigned long block
; 1446 unsigned blocksize
, bbits
; 1447 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
; 1448 char*kaddr
= (char*)kmap(page
); 1450 blocksize
= inode
->i_sb
->s_blocksize
; 1452 create_empty_buffers(page
, inode
, blocksize
); 1453 head
= page
->buffers
; 1455 bbits
= inode
->i_sb
->s_blocksize_bits
; 1456 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
); 1458 for(bh
= head
, block_start
=0; bh
!= head
|| !block_start
; 1459 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) { 1462 block_end
= block_start
+blocksize
; 1463 if(block_end
<= from
) 1465 if(block_start
>= to
) 1467 bh
->b_end_io
= end_buffer_io_sync
; 1468 if(!buffer_mapped(bh
)) { 1469 err
=get_block(inode
, block
, bh
,1); 1472 if(buffer_new(bh
)) { 1473 unmap_underlying_metadata(bh
); 1475 memset(kaddr
+to
,0, block_end
-to
); 1476 if(block_start
< from
) 1477 memset(kaddr
+block_start
,0, from
-block_start
); 1478 if(block_end
> to
|| block_start
< from
) 1479 flush_dcache_page(page
); 1483 if(!buffer_uptodate(bh
) && 1484 (block_start
< from
|| block_end
> to
)) { 1485 ll_rw_block(READ
,1, &bh
); 1490 * If we issued read requests - let them complete. 1492 while(wait_bh
> wait
) { 1493 wait_on_buffer(*--wait_bh
); 1495 if(!buffer_uptodate(*wait_bh
)) 1503 static int__block_commit_write(struct inode
*inode
,struct page
*page
, 1504 unsigned from
,unsigned to
) 1506 unsigned block_start
, block_end
; 1507 int partial
=0, need_balance_dirty
=0; 1509 struct buffer_head
*bh
, *head
; 1511 blocksize
= inode
->i_sb
->s_blocksize
; 1513 for(bh
= head
= page
->buffers
, block_start
=0; 1514 bh
!= head
|| !block_start
; 1515 block_start
=block_end
, bh
= bh
->b_this_page
) { 1516 block_end
= block_start
+ blocksize
; 1517 if(block_end
<= from
|| block_start
>= to
) { 1518 if(!buffer_uptodate(bh
)) 1521 set_bit(BH_Uptodate
, &bh
->b_state
); 1522 if(!atomic_set_buffer_dirty(bh
)) { 1524 need_balance_dirty
=1; 1529 if(need_balance_dirty
) 1530 balance_dirty(bh
->b_dev
);

	/*
	 * If this is a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page
); 1543 * Generic "read page" function for block devices that have the normal 1544 * get_block functionality. This is most of the block device filesystems. 1545 * Reads the page asynchronously --- the unlock_buffer() and 1546 * mark_buffer_uptodate() functions propagate buffer state into the 1547 * page struct once IO has completed. 1549 intblock_read_full_page(struct page
*page
, get_block_t
*get_block
) 1551 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1552 unsigned long iblock
, lblock
; 1553 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
]; 1554 unsigned int blocksize
, blocks
; 1555 unsigned long kaddr
=0; 1558 if(!PageLocked(page
)) 1560 blocksize
= inode
->i_sb
->s_blocksize
; 1562 create_empty_buffers(page
, inode
, blocksize
); 1563 head
= page
->buffers
; 1565 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
; 1566 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
); 1567 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
; 1573 if(buffer_uptodate(bh
)) 1576 if(!buffer_mapped(bh
)) { 1578 get_block(inode
, iblock
, bh
,0); 1579 if(!buffer_mapped(bh
)) { 1582 memset((char*)(kaddr
+ i
*blocksize
),0, blocksize
); 1583 flush_dcache_page(page
); 1584 set_bit(BH_Uptodate
, &bh
->b_state
); 1589 init_buffer(bh
, end_buffer_io_async
, NULL
); 1590 atomic_inc(&bh
->b_count
); 1593 }while(i
++, iblock
++, (bh
= bh
->b_this_page
) != head
); 1596 if(Page_Uptodate(page
)) 1598 ll_rw_block(READ
, nr
, arr
); 1601 * all buffers are uptodate - we can set the page 1604 SetPageUptodate(page
);

/*
 * For moronic filesystems that do not allow holes in files.
 * We may have to extend the file.
 */

int cont_prepare_write(struct page
*page
,unsigned offset
,unsigned to
, get_block_t
*get_block
,unsigned long*bytes
) 1619 struct address_space
*mapping
= page
->mapping
; 1620 struct inode
*inode
= (struct inode
*)mapping
->host
; 1621 struct page
*new_page
; 1622 unsigned long pgpos
; 1625 unsigned blocksize
= inode
->i_sb
->s_blocksize
; 1628 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) { 1630 new_page
=grab_cache_page(mapping
, pgpos
); 1633 /* we might sleep */ 1634 if(*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) { 1635 UnlockPage(new_page
); 1636 page_cache_release(new_page
); 1639 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
; 1640 if(zerofrom
& (blocksize
-1)) { 1641 *bytes
|= (blocksize
-1); 1644 status
=__block_prepare_write(inode
, new_page
, zerofrom
, 1645 PAGE_CACHE_SIZE
, get_block
); 1648 kaddr
=page_address(new_page
); 1649 memset(kaddr
+zerofrom
,0, PAGE_CACHE_SIZE
-zerofrom
); 1650 flush_dcache_page(new_page
); 1651 __block_commit_write(inode
, new_page
, zerofrom
, PAGE_CACHE_SIZE
); 1653 UnlockPage(new_page
); 1654 page_cache_release(new_page
); 1657 if(page
->index
< pgpos
) { 1658 /* completely inside the area */ 1661 /* page covers the boundary, find the boundary offset */ 1662 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
; 1664 /* if we will expand the thing last block will be filled */ 1665 if(to
> zerofrom
&& (zerofrom
& (blocksize
-1))) { 1666 *bytes
|= (blocksize
-1); 1670 /* starting below the boundary? Nothing to zero out */ 1671 if(offset
<= zerofrom
) 1674 status
=__block_prepare_write(inode
, page
, zerofrom
, to
, get_block
); 1677 kaddr
=page_address(page
); 1678 if(zerofrom
< offset
) { 1679 memset(kaddr
+zerofrom
,0, offset
-zerofrom
); 1680 flush_dcache_page(page
); 1681 __block_commit_write(inode
, page
, zerofrom
, offset
); 1685 ClearPageUptodate(page
); 1690 ClearPageUptodate(new_page
); 1692 UnlockPage(new_page
); 1693 page_cache_release(new_page
); 1698 intblock_prepare_write(struct page
*page
,unsigned from
,unsigned to
, 1699 get_block_t
*get_block
) 1701 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1702 int err
=__block_prepare_write(inode
, page
, from
, to
, get_block
); 1704 ClearPageUptodate(page
); 1710 intgeneric_commit_write(struct file
*file
,struct page
*page
, 1711 unsigned from
,unsigned to
) 1713 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1714 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
; 1715 __block_commit_write(inode
,page
,from
,to
); 1717 if(pos
> inode
->i_size
) { 1718 inode
->i_size
= pos
; 1719 mark_inode_dirty(inode
); 1724 intblock_write_full_page(struct page
*page
, get_block_t
*get_block
) 1726 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1727 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
; 1732 if(page
->index
< end_index
) 1733 return__block_write_full_page(inode
, page
, get_block
); 1735 /* things got complicated... */ 1736 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1); 1737 /* OK, are we completely out? */ 1738 if(page
->index
>= end_index
+1|| !offset
) 1740 /* Sigh... will have to work, then... */ 1741 err
=__block_prepare_write(inode
, page
,0, offset
, get_block
); 1743 memset(page_address(page
) + offset
,0, PAGE_CACHE_SIZE
- offset
); 1744 flush_dcache_page(page
); 1745 __block_commit_write(inode
,page
,0,offset
); 1750 ClearPageUptodate(page
); 1754 intgeneric_block_bmap(struct address_space
*mapping
,long block
, get_block_t
*get_block
) 1756 struct buffer_head tmp
; 1757 struct inode
*inode
= (struct inode
*)mapping
->host
; 1760 get_block(inode
, block
, &tmp
,0); 1761 return tmp
.b_blocknr
; 1765 * IO completion routine for a buffer_head being used for kiobuf IO: we 1766 * can't dispatch the kiobuf callback until io_count reaches 0. 1769 static voidend_buffer_io_kiobuf(struct buffer_head
*bh
,int uptodate
) 1771 struct kiobuf
*kiobuf
; 1773 mark_buffer_uptodate(bh
, uptodate
); 1775 kiobuf
= bh
->b_private
; 1777 end_kio_request(kiobuf
, uptodate
); 1782 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait 1783 * for them to complete. Clean up the buffer_heads afterwards. 1786 static intwait_kio(int rw
,int nr
,struct buffer_head
*bh
[],int size
) 1790 struct buffer_head
*tmp
; 1794 spin_lock(&unused_list_lock
); 1796 for(i
= nr
; --i
>=0; ) { 1799 if(buffer_locked(tmp
)) { 1800 spin_unlock(&unused_list_lock
); 1801 wait_on_buffer(tmp
); 1802 spin_lock(&unused_list_lock
); 1805 if(!buffer_uptodate(tmp
)) { 1806 /* We are traversing bh'es in reverse order so 1807 clearing iosize on error calculates the 1808 amount of IO before the first error. */ 1811 __put_unused_buffer_head(tmp
); 1814 spin_unlock(&unused_list_lock
); 1820 * Start I/O on a physical range of kernel memory, defined by a vector 1821 * of kiobuf structs (much like a user-space iovec list). 1823 * The kiobuf must already be locked for IO. IO is submitted 1824 * asynchronously: you need to check page->locked, page->uptodate, and 1825 * maybe wait on page->wait. 1827 * It is up to the caller to make sure that there are enough blocks 1828 * passed in to completely map the iobufs to disk. 1831 intbrw_kiovec(int rw
,int nr
,struct kiobuf
*iovec
[], 1832 kdev_t dev
,unsigned long b
[],int size
) 1842 int sectors
= size
>>9; 1843 unsigned long blocknr
; 1844 struct kiobuf
* iobuf
= NULL
; 1846 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
]; 1852 * First, do some alignment and validity checks 1854 for(i
=0; i
< nr
; i
++) { 1856 if((iobuf
->offset
& (size
-1)) || 1857 (iobuf
->length
& (size
-1))) 1859 if(!iobuf
->nr_pages
) 1860 panic("brw_kiovec: iobuf not initialised"); 1864 * OK to walk down the iovec doing page IO on each page we find. 1866 bufind
= bhind
= transferred
= err
=0; 1867 for(i
=0; i
< nr
; i
++) { 1869 offset
= iobuf
->offset
; 1870 length
= iobuf
->length
; 1873 for(pageind
=0; pageind
< iobuf
->nr_pages
; pageind
++) { 1874 map
= iobuf
->maplist
[pageind
]; 1881 blocknr
= b
[bufind
++]; 1882 tmp
=get_unused_buffer_head(0); 1888 tmp
->b_dev
= B_FREE
; 1890 set_bh_page(tmp
, map
, offset
); 1891 tmp
->b_this_page
= tmp
; 1893 init_buffer(tmp
, end_buffer_io_kiobuf
, iobuf
); 1894 tmp
->b_rdev
= tmp
->b_dev
= dev
; 1895 tmp
->b_blocknr
= blocknr
; 1896 tmp
->b_rsector
= blocknr
*sectors
; 1897 tmp
->b_state
= (1<< BH_Mapped
) | (1<< BH_Lock
) | (1<< BH_Req
); 1900 set_bit(BH_Uptodate
, &tmp
->b_state
); 1901 set_bit(BH_Dirty
, &tmp
->b_state
); 1908 atomic_inc(&iobuf
->io_count
); 1910 generic_make_request(rw
, tmp
); 1912 * Wait for IO if we have got too much 1914 if(bhind
>= KIO_MAX_SECTORS
) { 1915 err
=wait_kio(rw
, bhind
, bh
, size
); 1923 if(offset
>= PAGE_SIZE
) { 1927 }/* End of block loop */ 1928 }/* End of page loop */ 1929 }/* End of iovec loop */ 1931 /* Is there any IO still left to submit? */ 1933 err
=wait_kio(rw
, bhind
, bh
, size
); 1946 /* We got an error allocating the bh'es. Just free the current 1947 buffer_heads and exit. */ 1948 spin_lock(&unused_list_lock
); 1949 for(i
= bhind
; --i
>=0; ) { 1950 __put_unused_buffer_head(bh
[bhind
]); 1952 spin_unlock(&unused_list_lock
); 1957 * Start I/O on a page. 1958 * This function expects the page to be locked and may return 1959 * before I/O is complete. You then have to check page->locked, 1960 * page->uptodate, and maybe wait on page->wait. 1962 * brw_page() is SMP-safe, although it's being called with the 1963 * kernel lock held - but the code is ready. 1965 * FIXME: we need a swapper_inode->get_block function to remove 1966 * some of the bmap kludges and interface ugliness here. 1968 intbrw_page(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
) 1970 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
]; 1971 int nr
, fresh
/* temporary debugging flag */, block
; 1973 if(!PageLocked(page
)) 1974 panic("brw_page: page not locked for I/O"); 1975 // ClearPageError(page); 1977 * We pretty much rely on the page lock for this, because 1978 * create_page_buffers() might sleep. 1981 if(!page
->buffers
) { 1982 create_page_buffers(rw
, page
, dev
, b
, size
); 1988 head
= page
->buffers
; 1994 if(fresh
&& (atomic_read(&bh
->b_count
) !=0)) 1999 if(!buffer_uptodate(bh
)) { 2001 atomic_inc(&bh
->b_count
); 2004 if(!bh
->b_blocknr
) { 2007 bh
->b_blocknr
= block
; 2012 set_bit(BH_Uptodate
, &bh
->b_state
); 2013 set_bit(BH_Dirty
, &bh
->b_state
); 2015 atomic_inc(&bh
->b_count
); 2017 bh
= bh
->b_this_page
; 2019 if((rw
== READ
) && nr
) { 2020 if(Page_Uptodate(page
)) 2022 ll_rw_block(rw
, nr
, arr
); 2024 if(!nr
&& rw
== READ
) { 2025 SetPageUptodate(page
); 2028 if(nr
&& (rw
== WRITE
)) 2029 ll_rw_block(rw
, nr
, arr
); 2034 intblock_symlink(struct inode
*inode
,const char*symname
,int len
) 2036 struct address_space
*mapping
= inode
->i_mapping
; 2037 struct page
*page
=grab_cache_page(mapping
,0); 2043 err
= mapping
->a_ops
->prepare_write(NULL
, page
,0, len
-1); 2046 kaddr
=page_address(page
); 2047 memcpy(kaddr
, symname
, len
-1); 2048 mapping
->a_ops
->commit_write(NULL
, page
,0, len
-1); 2050 * Notice that we are _not_ going to block here - end of page is 2051 * unmapped, so this will only try to map the rest of page, see 2052 * that it is unmapped (typically even will not look into inode - 2053 * ->i_size will be enough for everything) and zero it out. 2054 * OTOH it's obviously correct and should make the page up-to-date. 2056 err
= mapping
->a_ops
->readpage(NULL
, page
); 2058 page_cache_release(page
); 2061 mark_inode_dirty(inode
); 2065 page_cache_release(page
); 2071 * Try to increase the number of buffers available: the size argument 2072 * is used to determine what kind of buffers we want. 2074 static intgrow_buffers(int size
) 2077 struct buffer_head
*bh
, *tmp
; 2078 struct buffer_head
* insert_point
; 2081 if((size
&511) || (size
> PAGE_SIZE
)) { 2082 printk("VFS: grow_buffers: size = %d\n",size
); 2086 page
=alloc_page(GFP_BUFFER
); 2089 bh
=create_buffers(page
, size
,0); 2091 goto no_buffer_head
; 2093 isize
=BUFSIZE_INDEX(size
); 2095 spin_lock(&free_list
[isize
].lock
); 2096 insert_point
= free_list
[isize
].list
; 2100 tmp
->b_next_free
= insert_point
->b_next_free
; 2101 tmp
->b_prev_free
= insert_point
; 2102 insert_point
->b_next_free
->b_prev_free
= tmp
; 2103 insert_point
->b_next_free
= tmp
; 2105 tmp
->b_prev_free
= tmp
; 2106 tmp
->b_next_free
= tmp
; 2109 if(tmp
->b_this_page
) 2110 tmp
= tmp
->b_this_page
; 2114 tmp
->b_this_page
= bh
; 2115 free_list
[isize
].list
= bh
; 2116 spin_unlock(&free_list
[isize
].lock
); 2119 page
->flags
&= ~(1<< PG_referenced
); 2120 lru_cache_add(page
); 2121 atomic_inc(&buffermem_pages
); 2125 page_cache_release(page
); 2131 * Sync all the buffers on one page.. 2133 * If we have old buffers that are locked, we'll 2134 * wait on them, but we won't wait on the new ones 2135 * we're writing out now. 2137 * This all is required so that we can free up memory 2141 * 0 - no wait (this does not get called - see try_to_free_buffers below) 2142 * 1 - start IO for dirty buffers 2143 * 2 - wait for completion of locked buffers 2145 static voidsync_page_buffers(struct buffer_head
*bh
,int wait
) 2147 struct buffer_head
* tmp
= bh
; 2150 struct buffer_head
*p
= tmp
; 2151 tmp
= tmp
->b_this_page
; 2152 if(buffer_locked(p
)) { 2154 __wait_on_buffer(p
); 2155 }else if(buffer_dirty(p
)) 2156 ll_rw_block(WRITE
,1, &p
);

/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 *
 * NOTE: There are quite a number of ways that threads of control can
 *       obtain a reference to a buffer head within a page.  So we must
 *       lock out all of these paths to cleanly toss the page.
 */
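/*
 * Illustrative note: buffer_busy() treats a buffer as busy if it still has
 * references (b_count != 0) or if any of BH_Dirty, BH_Lock or BH_Protected
 * is set - i.e. only clean, unlocked, unprotected buffers with a zero
 * reference count can be torn down by try_to_free_buffers() below.
 */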
int try_to_free_buffers(struct page * page
,int wait
) 2179 struct buffer_head
* tmp
, * bh
= page
->buffers
; 2180 int index
=BUFSIZE_INDEX(bh
->b_size
); 2182 spin_lock(&lru_list_lock
); 2183 write_lock(&hash_table_lock
); 2184 spin_lock(&free_list
[index
].lock
); 2187 struct buffer_head
*p
= tmp
; 2189 tmp
= tmp
->b_this_page
; 2191 goto busy_buffer_page
; 2194 spin_lock(&unused_list_lock
); 2197 struct buffer_head
* p
= tmp
; 2198 tmp
= tmp
->b_this_page
; 2200 /* The buffer can be either on the regular 2201 * queues or on the free list.. 2203 if(p
->b_dev
!= B_FREE
) 2204 __remove_from_queues(p
); 2206 __remove_from_free_list(p
, index
); 2207 __put_unused_buffer_head(p
); 2209 spin_unlock(&unused_list_lock
); 2211 /* Wake up anyone waiting for buffer heads */ 2212 wake_up(&buffer_wait
); 2214 /* And free the page */ 2215 page
->buffers
= NULL
; 2216 page_cache_release(page
); 2217 spin_unlock(&free_list
[index
].lock
); 2218 write_unlock(&hash_table_lock
); 2219 spin_unlock(&lru_list_lock
); 2223 /* Uhhuh, start writeback so that we don't end up with all dirty pages */ 2224 spin_unlock(&free_list
[index
].lock
); 2225 write_unlock(&hash_table_lock
); 2226 spin_unlock(&lru_list_lock
); 2228 sync_page_buffers(bh
, wait
); 2232 /* ================== Debugging =================== */ 2234 voidshow_buffers(void) 2237 struct buffer_head
* bh
; 2238 int found
=0, locked
=0, dirty
=0, used
=0, lastused
=0; 2241 static char*buf_types
[NR_LIST
] = {"CLEAN","LOCKED","DIRTY","PROTECTED", }; 2244 printk("Buffer memory: %6dkB\n", 2245 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10)); 2247 #ifdef CONFIG_SMP/* trylock does nothing on UP and so we could deadlock */ 2248 if(!spin_trylock(&lru_list_lock
)) 2250 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 2251 found
= locked
= dirty
= used
= lastused
=protected=0; 2252 bh
= lru_list
[nlist
]; 2257 if(buffer_locked(bh
)) 2259 if(buffer_protected(bh
)) 2261 if(buffer_dirty(bh
)) 2263 if(atomic_read(&bh
->b_count
)) 2264 used
++, lastused
= found
; 2265 bh
= bh
->b_next_free
; 2266 }while(bh
!= lru_list
[nlist
]); 2268 int tmp
= nr_buffers_type
[nlist
]; 2270 printk("%9s: BUG -> found %d, reported %d\n", 2271 buf_types
[nlist
], found
, tmp
); 2273 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " 2274 "%d locked, %d protected, %d dirty\n", 2275 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10, 2276 used
, lastused
, locked
,protected, dirty
); 2278 spin_unlock(&lru_list_lock
); 2282 /* ===================== Init ======================= */ 2285 * allocate the hash table and init the free list 2286 * Use gfp() for the hash table to decrease TLB misses, use 2287 * SLAB cache for buffer heads. 2289 void __init
buffer_init(unsigned long mempages
) 2292 unsigned int nr_hash
; 2294 /* The buffer cache hash table is less important these days, 2299 mempages
*=sizeof(struct buffer_head
*); 2301 for(order
=0; (1<< order
) < mempages
; order
++) 2304 /* try to allocate something until we get it or we're asking 2305 for something that is really too small */ 2310 nr_hash
= (PAGE_SIZE
<< order
) /sizeof(struct buffer_head
*); 2311 bh_hash_mask
= (nr_hash
-1); 2315 while((tmp
>>=1UL) !=0UL) 2318 hash_table
= (struct buffer_head
**) 2319 __get_free_pages(GFP_ATOMIC
, order
); 2320 }while(hash_table
== NULL
&& --order
>0); 2321 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", 2322 nr_hash
, order
, (PAGE_SIZE
<< order
)); 2325 panic("Failed to allocate buffer hash table\n"); 2327 /* Setup hash chains. */ 2328 for(i
=0; i
< nr_hash
; i
++) 2329 hash_table
[i
] = NULL
; 2331 /* Setup free lists. */ 2332 for(i
=0; i
< NR_SIZES
; i
++) { 2333 free_list
[i
].list
= NULL
; 2334 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
; 2337 /* Setup lru lists. */ 2338 for(i
=0; i
< NR_LIST
; i
++) 2344 /* ====================== bdflush support =================== */ 2346 /* This is a simple kernel daemon, whose job it is to provide a dynamic 2347 * response to dirty buffers. Once this process is activated, we write back 2348 * a limited number of buffers to the disks and then go back to sleep again. 2350 staticDECLARE_WAIT_QUEUE_HEAD(bdflush_done
); 2351 struct task_struct
*bdflush_tsk
=0; 2353 voidwakeup_bdflush(int block
) 2355 DECLARE_WAITQUEUE(wait
, current
); 2357 if(current
== bdflush_tsk
) 2361 wake_up_process(bdflush_tsk
);

		/* kflushd can wake us up before we have a chance to
		   go to sleep, so we must be smart in handling
		   this wakeup event from kflushd to avoid deadlocking in SMP
		   (we are not holding any lock anymore in these two paths). */
		__set_current_state(TASK_UNINTERRUPTIBLE
); 2370 add_wait_queue(&bdflush_done
, &wait
); 2372 wake_up_process(bdflush_tsk
); 2375 remove_wait_queue(&bdflush_done
, &wait
); 2376 __set_current_state(TASK_RUNNING
);

/* This is the _only_ function that deals with flushing async writes.
   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
   as all dirty buffers live _only_ in the DIRTY lru list.
   As we never browse the LOCKED and CLEAN lru lists they are in fact
   completely useless. */
static int flush_dirty_buffers(int check_flushtime
) 2387 struct buffer_head
* bh
, *next
; 2391 spin_lock(&lru_list_lock
); 2392 bh
= lru_list
[BUF_DIRTY
]; 2395 for(i
= nr_buffers_type
[BUF_DIRTY
]; i
-- >0; bh
= next
) { 2396 next
= bh
->b_next_free
; 2398 if(!buffer_dirty(bh
)) { 2399 __refile_buffer(bh
); 2402 if(buffer_locked(bh
)) 2405 if(check_flushtime
) { 2406 /* The dirty lru list is chronologically ordered so 2407 if the current bh is not yet timed out, 2408 then also all the following bhs 2409 will be too young. */ 2410 if(time_before(jiffies
, bh
->b_flushtime
)) 2413 if(++flushed
> bdf_prm
.b_un
.ndirty
) 2417 /* OK, now we are committed to write it out. */ 2418 atomic_inc(&bh
->b_count
); 2419 spin_unlock(&lru_list_lock
); 2420 ll_rw_block(WRITE
,1, &bh
); 2421 atomic_dec(&bh
->b_count
); 2423 if(current
->need_resched
) 2428 spin_unlock(&lru_list_lock
); 2434 * Here we attempt to write back old buffers. We also try to flush inodes 2435 * and supers as well, since this function is essentially "update", and 2436 * otherwise there would be no way of ensuring that these quantities ever 2437 * get written back. Ideally, we would have a timestamp on the inodes 2438 * and superblocks so that we could write back only the old ones as well 2441 static intsync_old_buffers(void) 2448 flush_dirty_buffers(1); 2449 /* must really sync all the active I/O request to disk here */ 2450 run_task_queue(&tq_disk
); 2454 intblock_sync_page(struct page
*page
) 2456 run_task_queue(&tq_disk
); 2460 /* This is the interface to bdflush. As we get more sophisticated, we can 2461 * pass tuning parameters to this "process", to adjust how it behaves. 2462 * We would want to verify each parameter, however, to make sure that it 2465 asmlinkage
longsys_bdflush(int func
,long data
) 2467 if(!capable(CAP_SYS_ADMIN
)) 2471 /* do_exit directly and let kupdate to do its work alone. */ 2473 #if 0/* left here as it's the only example of lazy-mm-stuff used from 2474 a syscall that doesn't care about the current mm context. */ 2476 struct mm_struct
*user_mm
;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm
=start_lazy_tlb(); 2485 error
=sync_old_buffers(); 2486 end_lazy_tlb(user_mm
); 2491 /* Basically func 1 means read param 1, 2 means write param 1, etc */ 2493 int i
= (func
-2) >>1; 2494 if(i
>=0&& i
< N_PARAM
) { 2496 returnput_user(bdf_prm
.data
[i
], (int*)data
); 2498 if(data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) { 2499 bdf_prm
.data
[i
] = data
; 2506 /* Having func 0 used to launch the actual bdflush and then never 2507 * return (unless explicitly killed). We return zero here to 2508 * remain semi-compatible with present update(8) programs. 2514 * This is the actual bdflush daemon itself. It used to be started from 2515 * the syscall above, but now we launch it ourselves internally with 2516 * kernel_thread(...) directly after the first thread in init/main.c 2518 intbdflush(void*sem
) 2520 struct task_struct
*tsk
= current
; 2523 * We have a bare-bones task_struct, and really should fill 2524 * in a few more things so "top" and /proc/2/{exe,root,cwd} 2525 * display semi-sane things. Not real crucial though... 2530 strcpy(tsk
->comm
,"kflushd"); 2533 /* avoid getting signals */ 2534 spin_lock_irq(&tsk
->sigmask_lock
); 2536 sigfillset(&tsk
->blocked
); 2537 recalc_sigpending(tsk
); 2538 spin_unlock_irq(&tsk
->sigmask_lock
); 2540 up((struct semaphore
*)sem
); 2543 CHECK_EMERGENCY_SYNC
2545 flushed
= flush_dirty_buffers(0);

		/* If wakeup_bdflush wakes us up
		   after our bdflush_done wakeup, then
		   we must make sure not to sleep
		   in schedule_timeout, otherwise
		   wakeup_bdflush may wait for our
		   bdflush_done wakeup that would never arrive
		   (as we would be sleeping) and so it would
		   deadlock. */
		__set_current_state(TASK_INTERRUPTIBLE
); 2556 wake_up(&bdflush_done
); 2558 * If there are still a lot of dirty buffers around, 2559 * skip the sleep and flush some more. Otherwise, we 2560 * go to sleep waiting a wakeup. 2562 if(!flushed
||balance_dirty_state(NODEV
) <0) 2564 /* Remember to mark us as running otherwise 2565 the next schedule will block. */ 2566 __set_current_state(TASK_RUNNING
);

/*
 * This is the kernel update daemon. It used to live in userspace
 * but since it needs to run safely we want it to be unkillable by mistake.
 * You don't need to change your userspace configuration since
 * the userspace `update` will do_exit(0) at the first sys_bdflush().
 */
int kupdate(void *sem
) 2578 struct task_struct
* tsk
= current
; 2583 strcpy(tsk
->comm
,"kupdate"); 2585 /* sigstop and sigcont will stop and wakeup kupdate */ 2586 spin_lock_irq(&tsk
->sigmask_lock
); 2587 sigfillset(&tsk
->blocked
); 2588 siginitsetinv(¤t
->blocked
,sigmask(SIGCONT
) |sigmask(SIGSTOP
)); 2589 recalc_sigpending(tsk
); 2590 spin_unlock_irq(&tsk
->sigmask_lock
); 2592 up((struct semaphore
*)sem
); 2595 /* update interval */ 2596 interval
= bdf_prm
.b_un
.interval
; 2598 tsk
->state
= TASK_INTERRUPTIBLE
; 2599 schedule_timeout(interval
); 2602 tsk
->state
= TASK_STOPPED
; 2603 schedule();/* wait for SIGCONT */ 2605 /* check for sigstop */ 2606 if(signal_pending(tsk
)) { 2608 spin_lock_irq(&tsk
->sigmask_lock
); 2609 if(sigismember(&tsk
->pending
.signal
, SIGSTOP
)) { 2610 sigdelset(&tsk
->pending
.signal
, SIGSTOP
); 2613 recalc_sigpending(tsk
); 2614 spin_unlock_irq(&tsk
->sigmask_lock
); 2619 printk("kupdate() activated...\n"); 2625 static int __init
bdflush_init(void) 2627 DECLARE_MUTEX_LOCKED(sem
); 2628 kernel_thread(bdflush
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGNAL
); 2630 kernel_thread(kupdate
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGNAL
); 2635 module_init(bdflush_init
)