/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required for older ARM systems.
 * - RMK
 */

/* Thread it... -DaveM */
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>

#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>
#define NR_SIZES 7
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
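/*
 * BUFSIZE_INDEX maps a (power-of-two) block size onto an index into
 * buffersize_index above, e.g. BUFSIZE_INDEX(512) == 0,
 * BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3 and
 * BUFSIZE_INDEX(32768) == 6; unsupported sizes map to -1.
 */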
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */
/* Anti-deadlock ordering:
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 */
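/*
 * (In practice: insert_into_queues() below takes lru_list_lock and then
 * hash_table_lock, and try_to_free_buffers() takes lru_list_lock,
 * hash_table_lock, the per-size free_list lock and finally
 * unused_list_lock, all in the order given above.)
 */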
/*
 * Hash table gook..
 */
static unsigned int bh_hash_mask = 0;
static unsigned int bh_hash_shift = 0;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST] = {0,};

static struct buffer_head * unused_list = NULL;
static int nr_unused_buffer_heads = 0;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
static struct bh_free_head free_list[NR_SIZES];

static kmem_cache_t *bh_cachep;

static int grow_buffers(int size);
/* This is used by some architectures to estimate available memory. */
atomic_t buffermem = ATOMIC_INIT(0);

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int dummy1;	/* unused */
		int age_buffer;	/* Time for normal buffer to age before we flush it */
		int age_super;	/* Time for superblock to age before we flush it */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};

/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,    10,     5,    25,    0,    1*HZ,    1*HZ,    1, 1};
int bdflush_max[N_PARAM] = {100, 50000, 20000, 20000, 1000, 6000*HZ, 6000*HZ, 2047, 5};
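/*
 * The initialiser above fills b_un in declaration order: nfract = 40 (%),
 * ndirty = 500 blocks, nrefill = 64, nref_dirt = 256, dummy1 = 15,
 * age_buffer = 30*HZ, age_super = 5*HZ, dummy2 = 1884, dummy3 = 2.
 */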
void wakeup_bdflush(int);
/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	atomic_inc(&bh->b_count);
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	atomic_dec(&bh->b_count);
}
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;

		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
	repeat:
		spin_lock(&lru_list_lock);
		bh = lru_list[BUF_DIRTY];
		if (!bh)
			goto repeat2;

		for (i = nr_buffers_type[BUF_DIRTY]*2; i-- > 0; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_DIRTY])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			atomic_inc(&bh->b_count);
			bh->b_flushtime = 0;
			spin_unlock(&lru_list_lock);
			ll_rw_block(WRITE, 1, &bh);
			atomic_dec(&bh->b_count);
			retry = 1;
			goto repeat;
		}

	repeat2:
		bh = lru_list[BUF_LOCKED];
		if (!bh) {
			spin_unlock(&lru_list_lock);
			break;
		}
		for (i = nr_buffers_type[BUF_LOCKED]*2; i-- > 0; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
				goto repeat2;
			}
		}
		spin_unlock(&lru_list_lock);

		/* If we are waiting for the sync to succeed, and if any dirty
		 * blocks were written, then repeat; on the second pass, only
		 * wait for buffers being written (do not pass to write any
		 * more buffers on the second pass).
		 */
	} while (wait && retry && ++pass <= 2);
	return err;
}
void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);

	lock_kernel();
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	unlock_kernel();

	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	fsync_dev(0);
	return 0;
}
/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;

	/* sync the inode to buffers */
	write_inode_now(inode);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	wait_on_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);

	/* .. finally sync the buffers to disk */
	dev = inode->i_dev;
	return sync_buffers(dev, 1);
}
341 asmlinkage intsys_fsync(unsigned int fd)
343 struct file * file;
344 struct dentry * dentry;
345 struct inode * inode;
346 int err;
348 lock_kernel();
349 err = -EBADF;
350 file =fget(fd);
351 if(!file)
352 goto out;
354 dentry = file->f_dentry;
355 if(!dentry)
356 goto out_putf;
358 inode = dentry->d_inode;
359 if(!inode)
360 goto out_putf;
362 err = -EINVAL;
363 if(!file->f_op || !file->f_op->fsync)
364 goto out_putf;
366 /* We need to protect against concurrent writers.. */
367 down(&inode->i_sem);
368 err = file->f_op->fsync(file, dentry);
369 up(&inode->i_sem);
371 out_putf:
372 fput(file);
373 out:
374 unlock_kernel();
375 return err;
378 asmlinkage intsys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 lock_kernel();
386 err = -EBADF;
387 file =fget(fd);
388 if(!file)
389 goto out;
391 dentry = file->f_dentry;
392 if(!dentry)
393 goto out_putf;
395 inode = dentry->d_inode;
396 if(!inode)
397 goto out_putf;
399 err = -EINVAL;
400 if(!file->f_op || !file->f_op->fsync)
401 goto out_putf;
403 /* this needs further work, at the moment it is identical to fsync() */
404 down(&inode->i_sem);
405 err = file->f_op->fsync(file, dentry);
406 up(&inode->i_sem);
408 out_putf:
409 fput(file);
410 out:
411 unlock_kernel();
412 return err;
415 voidinvalidate_buffers(kdev_t dev)
417 int nlist;
419 spin_lock(&lru_list_lock);
420 for(nlist =0; nlist < NR_LIST; nlist++) {
421 struct buffer_head * bh;
422 int i;
423 retry:
424 bh = lru_list[nlist];
425 if(!bh)
426 continue;
427 for(i = nr_buffers_type[nlist]*2; --i >0; bh = bh->b_next_free) {
428 if(bh->b_dev != dev)
429 continue;
430 if(buffer_locked(bh)) {
431 atomic_inc(&bh->b_count);
432 spin_unlock(&lru_list_lock);
433 wait_on_buffer(bh);
434 spin_lock(&lru_list_lock);
435 atomic_dec(&bh->b_count);
436 goto retry;
438 if(atomic_read(&bh->b_count))
439 continue;
440 bh->b_flushtime =0;
441 clear_bit(BH_Protected, &bh->b_state);
442 clear_bit(BH_Uptodate, &bh->b_state);
443 clear_bit(BH_Dirty, &bh->b_state);
444 clear_bit(BH_Req, &bh->b_state);
447 spin_unlock(&lru_list_lock);
/* After several hours of tedious analysis, the following hash
 * function won. Do not mess with it... -DaveM
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
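/*
 * hash(dev,block) folds shifted copies of the device and block numbers
 * together and masks the result with bh_hash_mask (table size - 1), so
 * the bucket index always stays within the hash_table allocated in
 * buffer_init() below.
 */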
458 static __inline__ void__hash_link(struct buffer_head *bh,struct buffer_head **head)
460 if((bh->b_next = *head) != NULL)
461 bh->b_next->b_pprev = &bh->b_next;
462 *head = bh;
463 bh->b_pprev = head;
466 static __inline__ void__hash_unlink(struct buffer_head *bh)
468 if(bh->b_next)
469 bh->b_next->b_pprev = bh->b_pprev;
470 *(bh->b_pprev) = bh->b_next;
471 bh->b_pprev = NULL;
474 static void__insert_into_lru_list(struct buffer_head * bh,int blist)
476 struct buffer_head **bhp = &lru_list[blist];
478 if(!*bhp) {
479 *bhp = bh;
480 bh->b_prev_free = bh;
482 bh->b_next_free = *bhp;
483 bh->b_prev_free = (*bhp)->b_prev_free;
484 (*bhp)->b_prev_free->b_next_free = bh;
485 (*bhp)->b_prev_free = bh;
486 nr_buffers_type[blist]++;
489 static void__remove_from_lru_list(struct buffer_head * bh,int blist)
491 if(bh->b_prev_free || bh->b_next_free) {
492 bh->b_prev_free->b_next_free = bh->b_next_free;
493 bh->b_next_free->b_prev_free = bh->b_prev_free;
494 if(lru_list[blist] == bh)
495 lru_list[blist] = bh->b_next_free;
496 if(lru_list[blist] == bh)
497 lru_list[blist] = NULL;
498 bh->b_next_free = bh->b_prev_free = NULL;
499 nr_buffers_type[blist]--;
503 static void__remove_from_free_list(struct buffer_head * bh,int index)
505 if(bh->b_next_free == bh)
506 free_list[index].list = NULL;
507 else{
508 bh->b_prev_free->b_next_free = bh->b_next_free;
509 bh->b_next_free->b_prev_free = bh->b_prev_free;
510 if(free_list[index].list == bh)
511 free_list[index].list = bh->b_next_free;
513 bh->b_next_free = bh->b_prev_free = NULL;
516 /* The following two functions must operate atomically
517 * because they control the visibility of a buffer head
518 * to the rest of the kernel.
520 static __inline__ void__remove_from_queues(struct buffer_head *bh)
522 write_lock(&hash_table_lock);
523 if(bh->b_pprev)
524 __hash_unlink(bh);
525 __remove_from_lru_list(bh, bh->b_list);
526 write_unlock(&hash_table_lock);
529 static voidinsert_into_queues(struct buffer_head *bh)
531 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
533 spin_lock(&lru_list_lock);
534 write_lock(&hash_table_lock);
535 __hash_link(bh, head);
536 __insert_into_lru_list(bh, bh->b_list);
537 write_unlock(&hash_table_lock);
538 spin_unlock(&lru_list_lock);
541 /* This function must only run if there are no other
542 * references _anywhere_ to this buffer head.
544 static voidput_last_free(struct buffer_head * bh)
546 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
547 struct buffer_head **bhp = &head->list;
549 spin_lock(&head->lock);
550 bh->b_dev = B_FREE;
551 if(!*bhp) {
552 *bhp = bh;
553 bh->b_prev_free = bh;
555 bh->b_next_free = *bhp;
556 bh->b_prev_free = (*bhp)->b_prev_free;
557 (*bhp)->b_prev_free->b_next_free = bh;
558 (*bhp)->b_prev_free = bh;
559 spin_unlock(&head->lock);
/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head **head = &hash(dev, block);
	struct buffer_head *bh;

	read_lock(&hash_table_lock);
	for (bh = *head; bh; bh = bh->b_next)
		if (bh->b_blocknr == block &&
		    bh->b_size    == size  &&
		    bh->b_dev     == dev)
			break;
	if (bh)
		atomic_inc(&bh->b_count);
	read_unlock(&hash_table_lock);

	return bh;
}

unsigned int get_hardblocksize(kdev_t dev)
{
	/*
	 * Get the hard sector size for the given device.  If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}
606 voidset_blocksize(kdev_t dev,int size)
608 externint*blksize_size[];
609 int i, nlist;
610 struct buffer_head * bh, *bhnext;
612 if(!blksize_size[MAJOR(dev)])
613 return;
615 /* Size must be a power of two, and between 512 and PAGE_SIZE */
616 if(size > PAGE_SIZE || size <512|| (size & (size-1)))
617 panic("Invalid blocksize passed to set_blocksize");
619 if(blksize_size[MAJOR(dev)][MINOR(dev)] ==0&& size == BLOCK_SIZE) {
620 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
621 return;
623 if(blksize_size[MAJOR(dev)][MINOR(dev)] == size)
624 return;
625 sync_buffers(dev,2);
626 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
628 /* We need to be quite careful how we do this - we are moving entries
629 * around on the free list, and we can get in a loop if we are not careful.
631 for(nlist =0; nlist < NR_LIST; nlist++) {
632 repeat:
633 spin_lock(&lru_list_lock);
634 bh = lru_list[nlist];
635 for(i = nr_buffers_type[nlist]*2; --i >0; bh = bhnext) {
636 if(!bh)
637 break;
639 bhnext = bh->b_next_free;
640 if(bh->b_dev != dev)
641 continue;
642 if(bh->b_size == size)
643 continue;
644 if(buffer_locked(bh)) {
645 atomic_inc(&bh->b_count);
646 spin_unlock(&lru_list_lock);
647 wait_on_buffer(bh);
648 atomic_dec(&bh->b_count);
649 goto repeat;
651 if(bh->b_dev == dev && bh->b_size != size) {
652 clear_bit(BH_Dirty, &bh->b_state);
653 clear_bit(BH_Uptodate, &bh->b_state);
654 clear_bit(BH_Req, &bh->b_state);
655 bh->b_flushtime =0;
657 if(atomic_read(&bh->b_count) ==0) {
658 __remove_from_queues(bh);
659 put_last_free(bh);
662 spin_unlock(&lru_list_lock);
667 * We used to try various strange things. Let's not.
669 static voidrefill_freelist(int size)
671 if(!grow_buffers(size)) {
672 wakeup_bdflush(1);
673 current->policy |= SCHED_YIELD;
674 schedule();
678 voidinit_buffer(struct buffer_head *bh, bh_end_io_t *handler,void*dev_id)
680 bh->b_list = BUF_CLEAN;
681 bh->b_flushtime =0;
682 bh->b_end_io = handler;
683 bh->b_dev_id = dev_id;
686 static voidend_buffer_io_sync(struct buffer_head *bh,int uptodate)
688 mark_buffer_uptodate(bh, uptodate);
689 unlock_buffer(bh);
692 static voidend_buffer_io_bad(struct buffer_head *bh,int uptodate)
694 mark_buffer_uptodate(bh, uptodate);
695 unlock_buffer(bh);
696 BUG();
699 static voidend_buffer_io_async(struct buffer_head * bh,int uptodate)
701 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
702 unsigned long flags;
703 struct buffer_head *tmp;
704 struct page *page;
705 int free;
707 mark_buffer_uptodate(bh, uptodate);
709 /* This is a temporary buffer used for page I/O. */
710 page = mem_map +MAP_NR(bh->b_data);
712 if(!uptodate)
713 SetPageError(page);
716 * Be _very_ careful from here on. Bad things can happen if
717 * two buffer heads end IO at almost the same time and both
718 * decide that the page is now completely done.
720 * Async buffer_heads are here only as labels for IO, and get
721 * thrown away once the IO for this page is complete. IO is
722 * deemed complete once all buffers have been visited
723 * (b_count==0) and are now unlocked. We must make sure that
724 * only the _last_ buffer that decrements its count is the one
725 * that free's the page..
727 spin_lock_irqsave(&page_uptodate_lock, flags);
728 unlock_buffer(bh);
729 atomic_dec(&bh->b_count);
730 tmp = bh->b_this_page;
731 while(tmp != bh) {
732 if(atomic_read(&tmp->b_count) &&
733 (tmp->b_end_io == end_buffer_io_async))
734 goto still_busy;
735 tmp = tmp->b_this_page;
738 /* OK, the async IO on this page is complete. */
739 spin_unlock_irqrestore(&page_uptodate_lock, flags);
742 * if none of the buffers had errors then we can set the
743 * page uptodate:
745 if(!PageError(page))
746 SetPageUptodate(page);
749 * Run the hooks that have to be done when a page I/O has completed.
751 * Note - we need to test the flags before we unlock the page, but
752 * we must not actually free the page until after the unlock!
754 if(test_and_clear_bit(PG_decr_after, &page->flags))
755 atomic_dec(&nr_async_pages);
757 if(test_and_clear_bit(PG_free_swap_after, &page->flags))
758 swap_free(page->offset);
760 free =test_and_clear_bit(PG_free_after, &page->flags);
762 if(page->owner != (void*)-1)
763 PAGE_BUG(page);
764 page->owner = current;
765 UnlockPage(page);
767 if(free)
768 __free_page(page);
770 return;
772 still_busy:
773 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 return;
/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	bh = get_hash_table(dev, block, size);
	if (bh) {
		if (!buffer_dirty(bh)) {
			bh->b_flushtime = 0;
		}
		goto out;
	}

	isize = BUFSIZE_INDEX(size);
	spin_lock(&free_list[isize].lock);
	bh = free_list[isize].list;
	if (bh) {
		__remove_from_free_list(bh, isize);
		atomic_set(&bh->b_count, 1);
	}
	spin_unlock(&free_list[isize].lock);
	if (!bh)
		goto refill;

	/* OK, FINALLY we know that this buffer is the only one of its kind,
	 * we hold a reference (b_count>0), it is unlocked, and it is clean.
	 */
	init_buffer(bh, end_buffer_io_sync, NULL);
	bh->b_dev = dev;
	bh->b_blocknr = block;
	bh->b_state = 1 << BH_Mapped;

	/* Insert the buffer into the regular lists */
	insert_into_queues(bh);
	goto out;

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
refill:
	refill_freelist(size);
	goto repeat;
out:
	return bh;
}
837 * if a new dirty buffer is created we need to balance bdflush.
839 * in the future we might want to make bdflush aware of different
840 * pressures on different devices - thus the (currently unused)
841 * 'dev' parameter.
843 int too_many_dirty_buffers;
845 voidbalance_dirty(kdev_t dev)
847 int dirty = nr_buffers_type[BUF_DIRTY];
848 int ndirty = bdf_prm.b_un.ndirty;
850 if(dirty > ndirty) {
851 if(dirty >2*ndirty) {
852 too_many_dirty_buffers =1;
853 wakeup_bdflush(1);
854 return;
856 wakeup_bdflush(0);
858 too_many_dirty_buffers =0;
859 return;
862 staticinlinevoid__mark_dirty(struct buffer_head *bh,int flag)
864 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
865 clear_bit(BH_New, &bh->b_state);
866 refile_buffer(bh);
869 void__mark_buffer_dirty(struct buffer_head *bh,int flag)
871 __mark_dirty(bh, flag);
875 * A buffer may need to be moved from one buffer list to another
876 * (e.g. in case it is not shared any more). Handle this.
878 static __inline__ void__refile_buffer(struct buffer_head *bh)
880 int dispose = BUF_CLEAN;
881 if(buffer_locked(bh))
882 dispose = BUF_LOCKED;
883 if(buffer_dirty(bh))
884 dispose = BUF_DIRTY;
885 if(dispose != bh->b_list) {
886 __remove_from_lru_list(bh, bh->b_list);
887 bh->b_list = dispose;
888 __insert_into_lru_list(bh, dispose);
892 voidrefile_buffer(struct buffer_head *bh)
894 spin_lock(&lru_list_lock);
895 __refile_buffer(bh);
896 spin_unlock(&lru_list_lock);
900 * Release a buffer head
902 void__brelse(struct buffer_head * buf)
904 touch_buffer(buf);
906 if(atomic_read(&buf->b_count)) {
907 atomic_dec(&buf->b_count);
908 return;
910 printk("VFS: brelse: Trying to free free buffer\n");
914 * bforget() is like brelse(), except it puts the buffer on the
915 * free list if it can.. We can NOT free the buffer if:
916 * - there are other users of it
917 * - it is locked and thus can have active IO
919 void__bforget(struct buffer_head * buf)
921 spin_lock(&lru_list_lock);
922 write_lock(&hash_table_lock);
923 if(atomic_read(&buf->b_count) !=1||buffer_locked(buf)) {
924 touch_buffer(buf);
925 atomic_dec(&buf->b_count);
926 }else{
927 atomic_set(&buf->b_count,0);
928 buf->b_state =0;
929 if(buf->b_pprev)
930 __hash_unlink(buf);
931 __remove_from_lru_list(buf, buf->b_list);
932 put_last_free(buf);
934 write_unlock(&hash_table_lock);
935 spin_unlock(&lru_list_lock);
/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
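/*
 * Typical caller pattern (sketch): a filesystem fetches a metadata block
 * with bh = bread(dev, block, sb->s_blocksize), checks for NULL, uses
 * bh->b_data, and drops its reference with brelse(bh) when done.  bread()
 * itself is just getblk() + ll_rw_block(READ) + wait_on_buffer().
 */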
958 * Ok, breada can be used as bread, but additionally to mark other
959 * blocks for reading as well. End the argument list with a negative
960 * number.
963 #define NBUF 16
965 struct buffer_head *breada(kdev_t dev,int block,int bufsize,
966 unsigned int pos,unsigned int filesize)
968 struct buffer_head * bhlist[NBUF];
969 unsigned int blocks;
970 struct buffer_head * bh;
971 int index;
972 int i, j;
974 if(pos >= filesize)
975 return NULL;
977 if(block <0)
978 return NULL;
980 bh =getblk(dev, block, bufsize);
981 index =BUFSIZE_INDEX(bh->b_size);
983 if(buffer_uptodate(bh))
984 return(bh);
985 elsell_rw_block(READ,1, &bh);
987 blocks = (filesize - pos) >> (9+index);
989 if(blocks < (read_ahead[MAJOR(dev)] >> index))
990 blocks = read_ahead[MAJOR(dev)] >> index;
991 if(blocks > NBUF)
992 blocks = NBUF;
994 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
996 bhlist[0] = bh;
997 j =1;
998 for(i=1; i<blocks; i++) {
999 bh =getblk(dev,block+i,bufsize);
1000 if(buffer_uptodate(bh)) {
1001 brelse(bh);
1002 break;
1004 else bhlist[j++] = bh;
1007 /* Request the read for these buffers, and then release them. */
1008 if(j>1)
1009 ll_rw_block(READA, (j-1), bhlist+1);
1010 for(i=1; i<j; i++)
1011 brelse(bhlist[i]);
1013 /* Wait for this buffer, and then continue on. */
1014 bh = bhlist[0];
1015 wait_on_buffer(bh);
1016 if(buffer_uptodate(bh))
1017 return bh;
1018 brelse(bh);
1019 return NULL;
1023 * Note: the caller should wake up the buffer_wait list if needed.
1025 static __inline__ void__put_unused_buffer_head(struct buffer_head * bh)
1027 if(nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1028 kmem_cache_free(bh_cachep, bh);
1029 }else{
1030 bh->b_blocknr = -1;
1031 init_waitqueue_head(&bh->b_wait);
1032 nr_unused_buffer_heads++;
1033 bh->b_next_free = unused_list;
1034 bh->b_this_page = NULL;
1035 unused_list = bh;
1039 static voidput_unused_buffer_head(struct buffer_head *bh)
1041 spin_lock(&unused_list_lock);
1042 __put_unused_buffer_head(bh);
1043 spin_unlock(&unused_list_lock);
1047 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1048 * no-buffer-head deadlock. Return NULL on failure; waiting for
1049 * buffer heads is now handled in create_buffers().
1051 static struct buffer_head *get_unused_buffer_head(int async)
1053 struct buffer_head * bh;
1055 spin_lock(&unused_list_lock);
1056 if(nr_unused_buffer_heads > NR_RESERVED) {
1057 bh = unused_list;
1058 unused_list = bh->b_next_free;
1059 nr_unused_buffer_heads--;
1060 spin_unlock(&unused_list_lock);
1061 return bh;
1063 spin_unlock(&unused_list_lock);
1065 /* This is critical. We can't swap out pages to get
1066 * more buffer heads, because the swap-out may need
1067 * more buffer-heads itself. Thus SLAB_BUFFER.
1069 if((bh =kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1070 memset(bh,0,sizeof(*bh));
1071 init_waitqueue_head(&bh->b_wait);
1072 return bh;
1076 * If we need an async buffer, use the reserved buffer heads.
1078 if(async) {
1079 spin_lock(&unused_list_lock);
1080 if(unused_list) {
1081 bh = unused_list;
1082 unused_list = bh->b_next_free;
1083 nr_unused_buffer_heads--;
1084 spin_unlock(&unused_list_lock);
1085 return bh;
1087 spin_unlock(&unused_list_lock);
1089 #if 0
1091 * (Pending further analysis ...)
1092 * Ordinary (non-async) requests can use a different memory priority
1093 * to free up pages. Any swapping thus generated will use async
1094 * buffer heads.
1096 if(!async &&
1097 (bh =kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1098 memset(bh,0,sizeof(*bh));
1099 init_waitqueue_head(&bh->b_wait);
1100 return bh;
1102 #endif
1104 return NULL;
1108 * Create the appropriate buffers when given a page for data area and
1109 * the size of each buffer.. Use the bh->b_this_page linked list to
1110 * follow the buffers created. Return NULL if unable to create more
1111 * buffers.
1112 * The async flag is used to differentiate async IO (paging, swapping)
1113 * from ordinary buffer allocations, and only async requests are allowed
1114 * to sleep waiting for buffer heads.
1116 static struct buffer_head *create_buffers(unsigned long page,unsigned long size,int async)
1118 DECLARE_WAITQUEUE(wait, current);
1119 struct buffer_head *bh, *head;
1120 long offset;
1122 try_again:
1123 head = NULL;
1124 offset = PAGE_SIZE;
1125 while((offset -= size) >=0) {
1126 bh =get_unused_buffer_head(async);
1127 if(!bh)
1128 goto no_grow;
1130 bh->b_dev = B_FREE;/* Flag as unused */
1131 bh->b_this_page = head;
1132 head = bh;
1134 bh->b_state =0;
1135 bh->b_next_free = NULL;
1136 bh->b_pprev = NULL;
1137 atomic_set(&bh->b_count,0);
1138 bh->b_size = size;
1140 bh->b_data = (char*) (page+offset);
1141 bh->b_list = BUF_CLEAN;
1142 bh->b_flushtime =0;
1143 bh->b_end_io = end_buffer_io_bad;
1145 return head;
1147 * In case anything failed, we just free everything we got.
1149 no_grow:
1150 if(head) {
1151 spin_lock(&unused_list_lock);
1153 bh = head;
1154 head = head->b_this_page;
1155 __put_unused_buffer_head(bh);
1156 }while(head);
1157 spin_unlock(&unused_list_lock);
1159 /* Wake up any waiters ... */
1160 wake_up(&buffer_wait);
1164 * Return failure for non-async IO requests. Async IO requests
1165 * are not allowed to fail, so we have to wait until buffer heads
1166 * become available. But we don't want tasks sleeping with
1167 * partially complete buffers, so all were released above.
1169 if(!async)
1170 return NULL;
1172 /* We're _really_ low on memory. Now we just
1173 * wait for old buffer heads to become free due to
1174 * finishing IO. Since this is an async request and
1175 * the reserve list is empty, we're sure there are
1176 * async buffer heads in use.
1178 run_task_queue(&tq_disk);
1181 * Set our state for sleeping, then check again for buffer heads.
1182 * This ensures we won't miss a wake_up from an interrupt.
1184 add_wait_queue(&buffer_wait, &wait);
1185 current->state = TASK_UNINTERRUPTIBLE;
1186 if(nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1187 current->policy |= SCHED_YIELD;
1188 schedule();
1190 remove_wait_queue(&buffer_wait, &wait);
1191 current->state = TASK_RUNNING;
1192 goto try_again;
1195 static intcreate_page_buffers(int rw,struct page *page, kdev_t dev,int b[],int size,int bmap)
1197 struct buffer_head *head, *bh, *tail;
1198 int block;
1200 if(!PageLocked(page))
1201 BUG();
1202 if(page->owner != current)
1203 PAGE_BUG(page);
1205 * Allocate async buffer heads pointing to this page, just for I/O.
1206 * They don't show up in the buffer hash table, but they *are*
1207 * registered in page->buffers.
1209 head =create_buffers(page_address(page), size,1);
1210 if(page->buffers)
1211 BUG();
1212 if(!head)
1213 BUG();
1214 tail = head;
1215 for(bh = head; bh; bh = bh->b_this_page) {
1216 block = *(b++);
1218 tail = bh;
1219 init_buffer(bh, end_buffer_io_async, NULL);
1220 bh->b_dev = dev;
1221 bh->b_blocknr = block;
1224 * When we use bmap, we define block zero to represent
1225 * a hole. ll_rw_page, however, may legitimately
1226 * access block zero, and we need to distinguish the
1227 * two cases.
1229 if(bmap && !block) {
1230 memset(bh->b_data,0, size);
1231 set_bit(BH_Uptodate, &bh->b_state);
1232 continue;
1234 set_bit(BH_Mapped, &bh->b_state);
1236 tail->b_this_page = head;
1237 get_page(page);
1238 page->buffers = head;
1239 return0;
1243 * We don't have to release all buffers here, but
1244 * we have to be sure that no dirty buffer is left
1245 * and no IO is going on (no buffer is locked), because
1246 * we have truncated the file and are going to free the
1247 * blocks on-disk..
1249 intblock_flushpage(struct inode *inode,struct page *page,unsigned long offset)
1251 struct buffer_head *head, *bh, *next;
1252 unsigned int curr_off =0;
1254 if(!PageLocked(page))
1255 BUG();
1256 if(!page->buffers)
1257 return0;
1259 head = page->buffers;
1260 bh = head;
1262 unsigned int next_off = curr_off + bh->b_size;
1263 next = bh->b_this_page;
1266 * is this block fully flushed?
1268 if(offset <= curr_off) {
1269 if(buffer_mapped(bh)) {
1270 atomic_inc(&bh->b_count);
1271 wait_on_buffer(bh);
1272 if(bh->b_dev == B_FREE)
1273 BUG();
1274 mark_buffer_clean(bh);
1275 clear_bit(BH_Uptodate, &bh->b_state);
1276 clear_bit(BH_Mapped, &bh->b_state);
1277 clear_bit(BH_Req, &bh->b_state);
1278 bh->b_blocknr =0;
1279 atomic_dec(&bh->b_count);
1282 curr_off = next_off;
1283 bh = next;
1284 }while(bh != head);
1287 * subtle. We release buffer-heads only if this is
1288 * the 'final' flushpage. We have invalidated the bmap
1289 * cached value unconditionally, so real IO is not
1290 * possible anymore.
1292 * If the free doesn't work out, the buffers can be
1293 * left around - they just turn into anonymous buffers
1294 * instead.
1296 if(!offset) {
1297 if(!try_to_free_buffers(page))
1298 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1301 return0;
1304 static voidcreate_empty_buffers(struct page *page,struct inode *inode,unsigned long blocksize)
1306 struct buffer_head *bh, *head, *tail;
1308 head =create_buffers(page_address(page), blocksize,1);
1309 if(page->buffers)
1310 BUG();
1312 bh = head;
1314 bh->b_dev = inode->i_dev;
1315 bh->b_blocknr =0;
1316 bh->b_end_io = end_buffer_io_bad;
1317 tail = bh;
1318 bh = bh->b_this_page;
1319 }while(bh);
1320 tail->b_this_page = head;
1321 page->buffers = head;
1322 get_page(page);
1326 * block_write_full_page() is SMP-safe - currently it's still
1327 * being called with the kernel lock held, but the code is ready.
1329 intblock_write_full_page(struct file *file,struct page *page)
1331 struct dentry *dentry = file->f_dentry;
1332 struct inode *inode = dentry->d_inode;
1333 int err, i;
1334 unsigned long block, offset;
1335 struct buffer_head *bh, *head;
1337 if(!PageLocked(page))
1338 BUG();
1340 if(!page->buffers)
1341 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1342 head = page->buffers;
1344 offset = page->offset;
1345 block = offset >> inode->i_sb->s_blocksize_bits;
1347 // FIXME: currently we assume page alignment.
1348 if(offset & (PAGE_SIZE-1))
1349 BUG();
1351 bh = head;
1352 i =0;
1354 if(!bh)
1355 BUG();
1358 * If the buffer isn't up-to-date, we can't be sure
1359 * that the buffer has been initialized with the proper
1360 * block number information etc..
1362 * Leave it to the low-level FS to make all those
1363 * decisions (block #0 may actually be a valid block)
1365 bh->b_end_io = end_buffer_io_sync;
1366 if(!buffer_mapped(bh)) {
1367 err = inode->i_op->get_block(inode, block, bh,1);
1368 if(err)
1369 goto out;
1371 set_bit(BH_Uptodate, &bh->b_state);
1372 mark_buffer_dirty(bh,0);
1374 bh = bh->b_this_page;
1375 block++;
1376 }while(bh != head);
1378 SetPageUptodate(page);
1379 return0;
1380 out:
1381 ClearPageUptodate(page);
1382 return err;
1385 intblock_write_partial_page(struct file *file,struct page *page,unsigned long offset,unsigned long bytes,const char* buf)
1387 struct dentry *dentry = file->f_dentry;
1388 struct inode *inode = dentry->d_inode;
1389 unsigned long block;
1390 int err, partial;
1391 unsigned long blocksize, start_block, end_block;
1392 unsigned long start_offset, start_bytes, end_bytes;
1393 unsigned long bbits, blocks, i, len;
1394 struct buffer_head *bh, *head;
1395 char* target_buf;
1397 target_buf = (char*)page_address(page) + offset;
1399 if(!PageLocked(page))
1400 BUG();
1402 blocksize = inode->i_sb->s_blocksize;
1403 if(!page->buffers)
1404 create_empty_buffers(page, inode, blocksize);
1405 head = page->buffers;
1407 bbits = inode->i_sb->s_blocksize_bits;
1408 block = page->offset >> bbits;
1409 blocks = PAGE_SIZE >> bbits;
1410 start_block = offset >> bbits;
1411 end_block = (offset + bytes -1) >> bbits;
1412 start_offset = offset & (blocksize -1);
1413 start_bytes = blocksize - start_offset;
1414 if(start_bytes > bytes)
1415 start_bytes = bytes;
1416 end_bytes = (offset+bytes) & (blocksize -1);
1417 if(end_bytes > bytes)
1418 end_bytes = bytes;
1420 if(offset <0|| offset >= PAGE_SIZE)
1421 BUG();
1422 if(bytes+offset <0|| bytes+offset > PAGE_SIZE)
1423 BUG();
1424 if(start_block <0|| start_block >= blocks)
1425 BUG();
1426 if(end_block <0|| end_block >= blocks)
1427 BUG();
1428 // FIXME: currently we assume page alignment.
1429 if(page->offset & (PAGE_SIZE-1))
1430 BUG();
1432 i =0;
1433 bh = head;
1434 partial =0;
1436 if(!bh)
1437 BUG();
1439 if((i < start_block) || (i > end_block)) {
1440 if(!buffer_uptodate(bh))
1441 partial =1;
1442 goto skip;
1446 * If the buffer is not up-to-date, we need to ask the low-level
1447 * FS to do something for us (we used to have assumptions about
1448 * the meaning of b_blocknr etc, that's bad).
1450 * If "update" is set, that means that the low-level FS should
1451 * try to make sure that the block is up-to-date because we're
1452 * not going to fill it completely.
1454 bh->b_end_io = end_buffer_io_sync;
1455 if(!buffer_mapped(bh)) {
1456 err = inode->i_op->get_block(inode, block, bh,1);
1457 if(err)
1458 goto out;
1461 if(!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1462 if(buffer_new(bh)) {
1463 memset(bh->b_data,0, bh->b_size);
1464 }else{
1465 ll_rw_block(READ,1, &bh);
1466 wait_on_buffer(bh);
1467 err = -EIO;
1468 if(!buffer_uptodate(bh))
1469 goto out;
1473 len = blocksize;
1474 if(start_offset) {
1475 len = start_bytes;
1476 start_offset =0;
1477 }else if(end_bytes && (i == end_block)) {
1478 len = end_bytes;
1479 end_bytes =0;
1481 err =copy_from_user(target_buf, buf, len);
1482 target_buf += len;
1483 buf += len;
1486 * we dirty buffers only after copying the data into
1487 * the page - this way we can dirty the buffer even if
1488 * the bh is still doing IO.
1490 * NOTE! This also does a direct dirty balace check,
1491 * rather than relying on bdflush just waking up every
1492 * once in a while. This is to catch (and slow down)
1493 * the processes that write tons of buffer..
1495 * Note how we do NOT want to do this in the full block
1496 * case: full pages are flushed not by the people who
1497 * dirtied them, but by people who need memory. And we
1498 * should not penalize them for somebody else writing
1499 * lots of dirty pages.
1501 set_bit(BH_Uptodate, &bh->b_state);
1502 if(!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1503 __mark_dirty(bh,0);
1504 if(too_many_dirty_buffers)
1505 balance_dirty(bh->b_dev);
1508 if(err) {
1509 err = -EFAULT;
1510 goto out;
1513 skip:
1514 i++;
1515 block++;
1516 bh = bh->b_this_page;
1517 }while(bh != head);
1520 * is this a partial write that happened to make all buffers
1521 * uptodate then we can optimize away a bogus readpage() for
1522 * the next read(). Here we 'discover' wether the page went
1523 * uptodate as a result of this (potentially partial) write.
1525 if(!partial)
1526 SetPageUptodate(page);
1527 return bytes;
1528 out:
1529 ClearPageUptodate(page);
1530 return err;
1535 * IO completion routine for a buffer_head being used for kiobuf IO: we
1536 * can't dispatch the kiobuf callback until io_count reaches 0.
1539 static voidend_buffer_io_kiobuf(struct buffer_head *bh,int uptodate)
1541 struct kiobuf *kiobuf;
1543 mark_buffer_uptodate(bh, uptodate);
1545 kiobuf = bh->b_kiobuf;
1546 if(atomic_dec_and_test(&kiobuf->io_count))
1547 kiobuf->end_io(kiobuf);
1548 if(!uptodate)
1549 kiobuf->errno = -EIO;
1554 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1555 * for them to complete. Clean up the buffer_heads afterwards.
1558 #define dprintk(x...)
1560 static intdo_kio(struct kiobuf *kiobuf,
1561 int rw,int nr,struct buffer_head *bh[],int size)
1563 int iosize;
1564 int i;
1565 struct buffer_head *tmp;
1567 struct task_struct *tsk = current;
1568 DECLARE_WAITQUEUE(wait, tsk);
1570 dprintk("do_kio start %d\n", rw);
1572 if(rw == WRITE)
1573 rw = WRITERAW;
1574 atomic_add(nr, &kiobuf->io_count);
1575 kiobuf->errno =0;
1576 ll_rw_block(rw, nr, bh);
1578 kiobuf_wait_for_io(kiobuf);
1580 spin_lock(&unused_list_lock);
1582 iosize =0;
1583 for(i = nr; --i >=0; ) {
1584 iosize += size;
1585 tmp = bh[i];
1586 if(!buffer_uptodate(tmp)) {
1587 /* We are traversing bh'es in reverse order so
1588 clearing iosize on error calculates the
1589 amount of IO before the first error. */
1590 iosize =0;
1592 __put_unused_buffer_head(tmp);
1595 spin_unlock(&unused_list_lock);
1597 dprintk("do_kio end %d %d\n", iosize, err);
1599 if(iosize)
1600 return iosize;
1601 if(kiobuf->errno)
1602 return kiobuf->errno;
1603 return-EIO;
1607 * Start I/O on a physical range of kernel memory, defined by a vector
1608 * of kiobuf structs (much like a user-space iovec list).
1610 * The kiobuf must already be locked for IO. IO is submitted
1611 * asynchronously: you need to check page->locked, page->uptodate, and
1612 * maybe wait on page->wait.
1614 * It is up to the caller to make sure that there are enough blocks
1615 * passed in to completely map the iobufs to disk.
1618 intbrw_kiovec(int rw,int nr,struct kiobuf *iovec[],
1619 kdev_t dev,unsigned long b[],int size,int bmap)
1621 int err;
1622 int length;
1623 int transferred;
1624 int i;
1625 int bufind;
1626 int pageind;
1627 int bhind;
1628 int offset;
1629 unsigned long blocknr;
1630 struct kiobuf * iobuf = NULL;
1631 unsigned long page;
1632 struct page * map;
1633 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1635 if(!nr)
1636 return0;
1639 * First, do some alignment and validity checks
1641 for(i =0; i < nr; i++) {
1642 iobuf = iovec[i];
1643 if((iobuf->offset & (size-1)) ||
1644 (iobuf->length & (size-1)))
1645 return-EINVAL;
1646 if(!iobuf->locked)
1647 panic("brw_kiovec: iobuf not locked for I/O");
1648 if(!iobuf->nr_pages)
1649 panic("brw_kiovec: iobuf not initialised");
1652 /* DEBUG */
1653 #if 0
1654 return iobuf->length;
1655 #endif
1656 dprintk("brw_kiovec: start\n");
1659 * OK to walk down the iovec doing page IO on each page we find.
1661 bufind = bhind = transferred = err =0;
1662 for(i =0; i < nr; i++) {
1663 iobuf = iovec[i];
1664 offset = iobuf->offset;
1665 length = iobuf->length;
1666 dprintk("iobuf %d %d %d\n", offset, length, size);
1668 for(pageind =0; pageind < iobuf->nr_pages; pageind++) {
1669 page = iobuf->pagelist[pageind];
1670 map = iobuf->maplist[pageind];
1672 while(length >0) {
1673 blocknr = b[bufind++];
1674 tmp =get_unused_buffer_head(0);
1675 if(!tmp) {
1676 err = -ENOMEM;
1677 goto error;
1680 tmp->b_dev = B_FREE;
1681 tmp->b_size = size;
1682 tmp->b_data = (char*) (page + offset);
1683 tmp->b_this_page = tmp;
1685 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1686 tmp->b_dev = dev;
1687 tmp->b_blocknr = blocknr;
1688 tmp->b_state =1<< BH_Mapped;
1689 tmp->b_kiobuf = iobuf;
1691 if(rw == WRITE) {
1692 set_bit(BH_Uptodate, &tmp->b_state);
1693 set_bit(BH_Dirty, &tmp->b_state);
1696 dprintk("buffer %d (%d) at %p\n",
1697 bhind, tmp->b_blocknr, tmp->b_data);
1698 bh[bhind++] = tmp;
1699 length -= size;
1700 offset += size;
1703 * Start the IO if we have got too much
1705 if(bhind >= KIO_MAX_SECTORS) {
1706 err =do_kio(iobuf, rw, bhind, bh, size);
1707 if(err >=0)
1708 transferred += err;
1709 else
1710 goto finished;
1711 bhind =0;
1714 if(offset >= PAGE_SIZE) {
1715 offset =0;
1716 break;
1718 }/* End of block loop */
1719 }/* End of page loop */
1720 }/* End of iovec loop */
1722 /* Is there any IO still left to submit? */
1723 if(bhind) {
1724 err =do_kio(iobuf, rw, bhind, bh, size);
1725 if(err >=0)
1726 transferred += err;
1727 else
1728 goto finished;
1731 finished:
1732 dprintk("brw_kiovec: end (%d, %d)\n", transferred, err);
1733 if(transferred)
1734 return transferred;
1735 return err;
1737 error:
1738 /* We got an error allocation the bh'es. Just free the current
1739 buffer_heads and exit. */
1740 spin_lock(&unused_list_lock);
1741 for(i = bhind; --i >=0; ) {
1742 __put_unused_buffer_head(bh[bhind]);
1744 spin_unlock(&unused_list_lock);
1745 goto finished;
1749 * Start I/O on a page.
1750 * This function expects the page to be locked and may return
1751 * before I/O is complete. You then have to check page->locked,
1752 * page->uptodate, and maybe wait on page->wait.
1754 * brw_page() is SMP-safe, although it's being called with the
1755 * kernel lock held - but the code is ready.
1757 * FIXME: we need a swapper_inode->get_block function to remove
1758 * some of the bmap kludges and interface ugliness here.
1760 intbrw_page(int rw,struct page *page, kdev_t dev,int b[],int size,int bmap)
1762 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1763 int nr, fresh /* temporary debugging flag */, block;
1765 if(!PageLocked(page))
1766 panic("brw_page: page not locked for I/O");
1767 // clear_bit(PG_error, &page->flags);
1769 * We pretty much rely on the page lock for this, because
1770 * create_page_buffers() might sleep.
1772 fresh =0;
1773 if(!page->buffers) {
1774 create_page_buffers(rw, page, dev, b, size, bmap);
1775 fresh =1;
1777 if(!page->buffers)
1778 BUG();
1779 page->owner = (void*)-1;
1781 head = page->buffers;
1782 bh = head;
1783 nr =0;
1785 block = *(b++);
1787 if(fresh && (atomic_read(&bh->b_count) !=0))
1788 BUG();
1789 if(rw == READ) {
1790 if(!fresh)
1791 BUG();
1792 if(bmap && !block) {
1793 if(block)
1794 BUG();
1795 }else{
1796 if(bmap && !block)
1797 BUG();
1798 if(!buffer_uptodate(bh)) {
1799 arr[nr++] = bh;
1800 atomic_inc(&bh->b_count);
1803 }else{/* WRITE */
1804 if(!bh->b_blocknr) {
1805 if(!block)
1806 BUG();
1807 bh->b_blocknr = block;
1808 }else{
1809 if(!block)
1810 BUG();
1812 set_bit(BH_Uptodate, &bh->b_state);
1813 set_bit(BH_Dirty, &bh->b_state);
1814 arr[nr++] = bh;
1815 atomic_inc(&bh->b_count);
1817 bh = bh->b_this_page;
1818 }while(bh != head);
1819 if(rw == READ)
1820 ++current->maj_flt;
1821 if((rw == READ) && nr) {
1822 if(Page_Uptodate(page))
1823 BUG();
1824 ll_rw_block(rw, nr, arr);
1825 }else{
1826 if(!nr && rw == READ) {
1827 SetPageUptodate(page);
1828 page->owner = current;
1829 UnlockPage(page);
1831 if(nr && (rw == WRITE))
1832 ll_rw_block(rw, nr, arr);
1834 return0;
1838 * Generic "read page" function for block devices that have the normal
1839 * bmap functionality. This is most of the block device filesystems.
1840 * Reads the page asynchronously --- the unlock_buffer() and
1841 * mark_buffer_uptodate() functions propagate buffer state into the
1842 * page struct once IO has completed.
1844 intblock_read_full_page(struct file * file,struct page * page)
1846 struct dentry *dentry = file->f_dentry;
1847 struct inode *inode = dentry->d_inode;
1848 unsigned long iblock;
1849 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1850 unsigned int blocksize, blocks;
1851 int nr;
1853 if(!PageLocked(page))
1854 PAGE_BUG(page);
1855 blocksize = inode->i_sb->s_blocksize;
1856 if(!page->buffers)
1857 create_empty_buffers(page, inode, blocksize);
1858 head = page->buffers;
1860 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1861 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1862 page->owner = (void*)-1;
1863 bh = head;
1864 nr =0;
1867 if(buffer_uptodate(bh))
1868 continue;
1870 if(!buffer_mapped(bh)) {
1871 inode->i_op->get_block(inode, iblock, bh,0);
1872 if(!buffer_mapped(bh)) {
1873 memset(bh->b_data,0, blocksize);
1874 set_bit(BH_Uptodate, &bh->b_state);
1875 continue;
1879 init_buffer(bh, end_buffer_io_async, NULL);
1880 atomic_inc(&bh->b_count);
1881 arr[nr] = bh;
1882 nr++;
1883 }while(iblock++, (bh = bh->b_this_page) != head);
1885 ++current->maj_flt;
1886 if(nr) {
1887 if(Page_Uptodate(page))
1888 BUG();
1889 ll_rw_block(READ, nr, arr);
1890 }else{
1892 * all buffers are uptodate - we can set the page
1893 * uptodate as well.
1895 SetPageUptodate(page);
1896 page->owner = current;
1897 UnlockPage(page);
1899 return0;
1903 * Try to increase the number of buffers available: the size argument
1904 * is used to determine what kind of buffers we want.
1906 static intgrow_buffers(int size)
1908 unsigned long page;
1909 struct buffer_head *bh, *tmp;
1910 struct buffer_head * insert_point;
1911 int isize;
1913 if((size &511) || (size > PAGE_SIZE)) {
1914 printk("VFS: grow_buffers: size = %d\n",size);
1915 return0;
1918 if(!(page =__get_free_page(GFP_BUFFER)))
1919 return0;
1920 bh =create_buffers(page, size,0);
1921 if(!bh) {
1922 free_page(page);
1923 return0;
1926 isize =BUFSIZE_INDEX(size);
1928 spin_lock(&free_list[isize].lock);
1929 insert_point = free_list[isize].list;
1930 tmp = bh;
1931 while(1) {
1932 if(insert_point) {
1933 tmp->b_next_free = insert_point->b_next_free;
1934 tmp->b_prev_free = insert_point;
1935 insert_point->b_next_free->b_prev_free = tmp;
1936 insert_point->b_next_free = tmp;
1937 }else{
1938 tmp->b_prev_free = tmp;
1939 tmp->b_next_free = tmp;
1941 insert_point = tmp;
1942 if(tmp->b_this_page)
1943 tmp = tmp->b_this_page;
1944 else
1945 break;
1947 tmp->b_this_page = bh;
1948 free_list[isize].list = bh;
1949 spin_unlock(&free_list[isize].lock);
1951 mem_map[MAP_NR(page)].buffers = bh;
1952 atomic_add(PAGE_SIZE, &buffermem);
1953 return1;
1957 * Can the buffer be thrown out?
1959 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1960 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
1963 * try_to_free_buffers() checks if all the buffers on this particular page
1964 * are unused, and free's the page if so.
1966 * Wake up bdflush() if this fails - if we're running low on memory due
1967 * to dirty buffers, we need to flush them out as quickly as possible.
1969 * NOTE: There are quite a number of ways that threads of control can
1970 * obtain a reference to a buffer head within a page. So we must
1971 * lock out all of these paths to cleanly toss the page.
1973 inttry_to_free_buffers(struct page * page)
1975 struct buffer_head * tmp, * bh = page->buffers;
1976 int index =BUFSIZE_INDEX(bh->b_size);
1977 int ret;
1979 spin_lock(&lru_list_lock);
1980 write_lock(&hash_table_lock);
1981 spin_lock(&free_list[index].lock);
1982 tmp = bh;
1984 struct buffer_head * p = tmp;
1986 tmp = tmp->b_this_page;
1987 if(buffer_busy(p))
1988 goto busy_buffer_page;
1989 }while(tmp != bh);
1991 spin_lock(&unused_list_lock);
1992 tmp = bh;
1994 struct buffer_head * p = tmp;
1995 tmp = tmp->b_this_page;
1997 /* The buffer can be either on the regular
1998 * queues or on the free list..
2000 if(p->b_dev == B_FREE) {
2001 __remove_from_free_list(p, index);
2002 }else{
2003 if(p->b_pprev)
2004 __hash_unlink(p);
2005 __remove_from_lru_list(p, p->b_list);
2007 __put_unused_buffer_head(p);
2008 }while(tmp != bh);
2009 spin_unlock(&unused_list_lock);
2011 /* Wake up anyone waiting for buffer heads */
2012 wake_up(&buffer_wait);
2014 /* And free the page */
2015 page->buffers = NULL;
2016 __free_page(page);
2017 ret =1;
2018 out:
2019 spin_unlock(&free_list[index].lock);
2020 write_unlock(&hash_table_lock);
2021 spin_unlock(&lru_list_lock);
2022 return ret;
2024 busy_buffer_page:
2025 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2026 too_many_dirty_buffers =1;
2027 wakeup_bdflush(0);
2028 ret =0;
2029 goto out;
/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long memory_size)
{
	int order, i;
	unsigned int nr_hash;

	/* The buffer cache hash table is less important these days,
	 * trim it a bit.
	 */
	memory_size >>= 14;
	memory_size *= sizeof(struct buffer_head *);
	for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
		;

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */
	do {
		unsigned long tmp;

		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
		bh_hash_mask = (nr_hash - 1);

		tmp = nr_hash;
		bh_hash_shift = 0;
		while ((tmp >>= 1UL) != 0UL)
			bh_hash_shift++;

		hash_table = (struct buffer_head **)
			__get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 0);
	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
	       nr_hash, order, (1UL<<order) * PAGE_SIZE);

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");
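	/*
	 * Rough sizing example (assuming 4K pages and 4-byte pointers):
	 * with memory_size = 64MB, 64MB >> 14 = 4096, times sizeof(pointer)
	 * gives 16KB, i.e. order 2; that yields nr_hash = 4096 buckets,
	 * so bh_hash_mask = 4095 and bh_hash_shift = 12.
	 */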
	/* Setup hash chains. */
	for (i = 0; i < nr_hash; i++)
		hash_table[i] = NULL;

	/* Setup free lists. */
	for (i = 0; i < NR_SIZES; i++) {
		free_list[i].list = NULL;
		free_list[i].lock = SPIN_LOCK_UNLOCKED;
	}

	/* Setup lru lists. */
	for (i = 0; i < NR_LIST; i++)
		lru_list[i] = NULL;

	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!bh_cachep)
		panic("Cannot create buffer head SLAB cache\n");
}
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int wait)
{
	if (current == bdflush_tsk)
		return;
	if (wait)
		run_task_queue(&tq_disk);
	wake_up(&bdflush_wait);
	if (wait)
		sleep_on(&bdflush_done);
}
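/*
 * Note: wakeup_bdflush(0) just pokes the daemon; wakeup_bdflush(1) also
 * kicks the disk task queue and then sleeps on bdflush_done, i.e. it does
 * not return until bdflush has finished a pass and woken bdflush_done
 * (see the wake_up(&bdflush_done) at the end of the main loop below).
 */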
2121 * Here we attempt to write back old buffers. We also try to flush inodes
2122 * and supers as well, since this function is essentially "update", and
2123 * otherwise there would be no way of ensuring that these quantities ever
2124 * get written back. Ideally, we would have a timestamp on the inodes
2125 * and superblocks so that we could write back only the old ones as well
2128 static intsync_old_buffers(void)
2130 int nlist;
2132 lock_kernel();
2133 sync_supers(0);
2134 sync_inodes(0);
2135 unlock_kernel();
2137 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2138 struct buffer_head *bh;
2139 repeat:
2140 spin_lock(&lru_list_lock);
2141 bh = lru_list[nlist];
2142 if(bh) {
2143 struct buffer_head *next;
2144 int i;
2145 for(i = nr_buffers_type[nlist]; i-- >0; bh = next) {
2146 next = bh->b_next_free;
2148 /* If the buffer is not on the proper list,
2149 * then refile it.
2151 if((nlist == BUF_DIRTY &&
2152 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2153 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2154 __refile_buffer(bh);
2155 continue;
2158 if(buffer_locked(bh) || !buffer_dirty(bh))
2159 continue;
2161 /* OK, now we are committed to write it out. */
2162 bh->b_flushtime =0;
2163 atomic_inc(&bh->b_count);
2164 spin_unlock(&lru_list_lock);
2165 ll_rw_block(WRITE,1, &bh);
2166 atomic_dec(&bh->b_count);
2167 goto repeat;
2170 spin_unlock(&lru_list_lock);
2172 run_task_queue(&tq_disk);
2173 return0;
/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */

asmlinkage int sys_bdflush(int func, long data)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (func == 1) {
		int error;
		struct mm_struct *user_mm;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm = start_lazy_tlb();
		error = sync_old_buffers();
		end_lazy_tlb(user_mm);
		return error;
	}

	/* Basically func 2 means read param 1, func 3 means write param 1, etc */
	if (func >= 2) {
		int i = (func-2) >> 1;
		if (i >= 0 && i < N_PARAM) {
			if ((func & 1) == 0)
				return put_user(bdf_prm.data[i], (int*)data);
			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
				bdf_prm.data[i] = data;
				return 0;
			}
		}
		return -EINVAL;
	}

	/* Func 0 used to launch the actual bdflush and then never return
	 * (unless explicitly killed). We return zero here to remain
	 * semi-compatible with present update(8) programs.
	 */
	return 0;
}
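/*
 * In general, func 2N+2 reads bdf_prm.data[N] into *(int *)data and
 * func 2N+3 sets data[N] (subject to bdflush_min[]/bdflush_max[]);
 * e.g. func 4 reads ndirty and func 5 tunes it.
 */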
2225 * This is the actual bdflush daemon itself. It used to be started from
2226 * the syscall above, but now we launch it ourselves internally with
2227 * kernel_thread(...) directly after the first thread in init/main.c
2229 intbdflush(void* unused)
2232 * We have a bare-bones task_struct, and really should fill
2233 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2234 * display semi-sane things. Not real crucial though...
2237 current->session =1;
2238 current->pgrp =1;
2239 sprintf(current->comm,"kflushd");
2240 bdflush_tsk = current;
2242 for(;;) {
2243 int nlist;
2245 CHECK_EMERGENCY_SYNC
2247 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2248 int nr, major, written =0;
2249 struct buffer_head *next;
2251 repeat:
2252 spin_lock(&lru_list_lock);
2253 next = lru_list[nlist];
2254 nr = nr_buffers_type[nlist];
2255 while(nr-- >0) {
2256 struct buffer_head *bh = next;
2258 next = next->b_next_free;
2260 /* If the buffer is not on the correct list,
2261 * then refile it.
2263 if((nlist == BUF_DIRTY &&
2264 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2265 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2266 __refile_buffer(bh);
2267 continue;
2270 /* If we aren't in panic mode, don't write out too much
2271 * at a time. Also, don't write out buffers we don't
2272 * really have to write out yet..
2274 if(!too_many_dirty_buffers) {
2275 if(written > bdf_prm.b_un.ndirty)
2276 break;
2277 if(time_before(jiffies, bh->b_flushtime))
2278 continue;
2281 if(buffer_locked(bh) || !buffer_dirty(bh))
2282 continue;
2284 major =MAJOR(bh->b_dev);
2285 written++;
2286 bh->b_flushtime =0;
2289 * For the loop major we can try to do asynchronous writes,
2290 * but we have to guarantee that we're making some progress..
2292 atomic_inc(&bh->b_count);
2293 spin_unlock(&lru_list_lock);
2294 if(major == LOOP_MAJOR && written >1) {
2295 ll_rw_block(WRITEA,1, &bh);
2296 if(buffer_dirty(bh))
2297 --written;
2298 }else
2299 ll_rw_block(WRITE,1, &bh);
2300 atomic_dec(&bh->b_count);
2301 goto repeat;
2303 spin_unlock(&lru_list_lock);
2305 run_task_queue(&tq_disk);
2306 wake_up(&bdflush_done);
2309 * If there are still a lot of dirty buffers around,
2310 * skip the sleep and flush some more. Otherwise, we
2311 * sleep for a while and mark us as not being in panic
2312 * mode..
2314 if(!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2315 too_many_dirty_buffers =0;
2316 spin_lock_irq(&current->sigmask_lock);
2317 flush_signals(current);
2318 spin_unlock_irq(&current->sigmask_lock);
2319 interruptible_sleep_on_timeout(&bdflush_wait,5*HZ);
2324 static int __init bdflush_init(void)
2326 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2327 return0;
2330 module_init(bdflush_init)