/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required on older ARM systems. */

/* Thread it... -DaveM */

/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */

/* Anti-deadlock ordering:
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 */
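/*
 * Worked example (illustrative, assuming a 4096-byte PAGE_CACHE_SIZE):
 * BUFSIZE_INDEX(1024) == buffersize_index[2] == 1 and
 * BUFSIZE_INDEX(4096) == buffersize_index[8] == 3, i.e. the per-size
 * arrays below are indexed by log2(blocksize) - 9.  With 4096-byte pages,
 * MAX_BUF_PER_PAGE is 8 (eight 512-byte buffers per page) and NR_RESERVED
 * is therefore 16.
 */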
static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST];
static unsigned long size_buffers_type[NR_LIST];

static struct buffer_head * unused_list;
static int nr_unused_buffer_heads;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
static struct bh_free_head free_list[NR_SIZES];

static int grow_buffers(int size);
static void __refile_buffer(struct buffer_head *);
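/*
 * Rough picture of the structures above (illustrative summary): every
 * buffer_head in the cache is hashed by (b_dev, b_blocknr) into hash_table
 * for getblk()/get_hash_table() lookups, and it also sits on exactly one of
 * the lru_list[] chains according to b_list (BUF_CLEAN, BUF_LOCKED,
 * BUF_DIRTY, BUF_PROTECTED); nr_buffers_type[] and size_buffers_type[]
 * mirror those chains.  Buffer heads with no data attached wait on
 * unused_list, while whole unused buffers hang off
 * free_list[BUFSIZE_INDEX(size)].  The locks must always be taken in the
 * anti-deadlock order documented above.
 */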
/* This is used by some architectures to estimate available memory. */
atomic_t buffermem_pages = ATOMIC_INIT(0);

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int interval;	/* jiffies delay between kupdate flushes */
		int age_buffer;	/* Time for normal buffer to age before we flush it */
		int dummy1;	/* unused, was age_super */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};

/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,    10,     5,    25,      0,    1*HZ,    1*HZ,    1, 1};
int bdflush_max[N_PARAM] = {100, 50000, 20000, 20000, 600*HZ, 6000*HZ, 6000*HZ, 2047, 5};

/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and got rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
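/*
 * Illustrative reading of the defaults above: nfract=40 means writeback
 * pressure starts once roughly 40% of the (non-protected) buffer-cache
 * pages are dirty (see balance_dirty_state() below), ndirty=500 caps how
 * many dirty blocks bdflush writes per wake-cycle, interval=5*HZ makes
 * kupdate run about every 5 seconds (assuming HZ=100), and age_buffer=30*HZ
 * lets a freshly dirtied buffer sit for about 30 seconds before kupdate
 * considers it old enough to flush.
 */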
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	atomic_inc(&bh->b_count);
	add_wait_queue(&bh->b_wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!buffer_locked(bh))
			break;
		schedule();
	} while (buffer_locked(bh));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	atomic_dec(&bh->b_count);
}

/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev
,int wait
) 173 int i
, retry
, pass
=0, err
=0; 174 struct buffer_head
* bh
, *next
; 176 /* One pass for no-wait, three for wait: 177 * 0) write out all dirty, unlocked buffers; 178 * 1) write out all dirty buffers, waiting if locked; 179 * 2) wait for completion by waiting for all buffers to unlock. 184 /* We search all lists as a failsafe mechanism, not because we expect 185 * there to be dirty buffers on any of the other lists. 188 spin_lock(&lru_list_lock
); 189 bh
= lru_list
[BUF_DIRTY
]; 193 for(i
= nr_buffers_type
[BUF_DIRTY
]*2; i
-- >0; bh
= next
) { 194 next
= bh
->b_next_free
; 196 if(!lru_list
[BUF_DIRTY
]) 198 if(dev
&& bh
->b_dev
!= dev
) 200 if(buffer_locked(bh
)) { 201 /* Buffer is locked; skip it unless wait is 202 * requested AND pass > 0. 208 atomic_inc(&bh
->b_count
); 209 spin_unlock(&lru_list_lock
); 211 atomic_dec(&bh
->b_count
); 215 /* If an unlocked buffer is not uptodate, there has 216 * been an IO error. Skip it. 218 if(wait
&&buffer_req(bh
) && !buffer_locked(bh
) && 219 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) { 224 /* Don't write clean buffers. Don't write ANY buffers 227 if(!buffer_dirty(bh
) || pass
>=2) 230 atomic_inc(&bh
->b_count
); 231 spin_unlock(&lru_list_lock
); 232 ll_rw_block(WRITE
,1, &bh
); 233 atomic_dec(&bh
->b_count
); 239 bh
= lru_list
[BUF_LOCKED
]; 241 spin_unlock(&lru_list_lock
); 244 for(i
= nr_buffers_type
[BUF_LOCKED
]*2; i
-- >0; bh
= next
) { 245 next
= bh
->b_next_free
; 247 if(!lru_list
[BUF_LOCKED
]) 249 if(dev
&& bh
->b_dev
!= dev
) 251 if(buffer_locked(bh
)) { 252 /* Buffer is locked; skip it unless wait is 253 * requested AND pass > 0. 259 atomic_inc(&bh
->b_count
); 260 spin_unlock(&lru_list_lock
); 262 spin_lock(&lru_list_lock
); 263 atomic_dec(&bh
->b_count
); 267 spin_unlock(&lru_list_lock
); 269 /* If we are waiting for the sync to succeed, and if any dirty 270 * blocks were written, then repeat; on the second pass, only 271 * wait for buffers being written (do not pass to write any 272 * more buffers on the second pass). 274 }while(wait
&& retry
&& ++pass
<=2); 278 voidsync_dev(kdev_t dev
)

	/* sync all the dirty buffers out to disk only _after_ all the
	   high level layers have finished generating dirty buffer data
	   (or we'll return with some buffers still dirty on the block device,
	   thus breaking the semantics of this call) */

	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */

int fsync_dev(kdev_t dev
) 311 returnsync_buffers(dev
,1); 314 asmlinkage
longsys_sync(void) 321 * filp may be NULL if called via the msync of a vma. 324 intfile_fsync(struct file
*filp
,struct dentry
*dentry
,int datasync
) 326 struct inode
* inode
= dentry
->d_inode
; 327 struct super_block
* sb
; 332 /* sync the inode to buffers */ 333 write_inode_now(inode
,0); 335 /* sync the superblock to buffers */ 338 if(sb
->s_op
&& sb
->s_op
->write_super
) 339 sb
->s_op
->write_super(sb
); 341 /* .. finally sync the buffers to disk */ 343 ret
=sync_buffers(dev
,1); 348 asmlinkage
longsys_fsync(unsigned int fd
) 351 struct dentry
* dentry
; 352 struct inode
* inode
; 360 dentry
= file
->f_dentry
; 361 inode
= dentry
->d_inode
; 364 if(!file
->f_op
|| !file
->f_op
->fsync
) 367 /* We need to protect against concurrent writers.. */ 369 err
= file
->f_op
->fsync(file
, dentry
,0); 378 asmlinkage
longsys_fdatasync(unsigned int fd
) 381 struct dentry
* dentry
; 382 struct inode
* inode
; 390 dentry
= file
->f_dentry
; 391 inode
= dentry
->d_inode
; 394 if(!file
->f_op
|| !file
->f_op
->fsync
) 398 err
= file
->f_op
->fsync(file
, dentry
,1);

/* After several hours of tedious analysis, the following hash
 * function won. Do not mess with it... -DaveM
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
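/*
 * Illustrative note: _hashfn() folds the device number and the block number
 * together by XOR-ing several shifted copies of each, so that neighbouring
 * blocks of the same device tend to land on different hash chains; hash()
 * then masks the result with bh_hash_mask (the table size minus one, always
 * a power of two - see buffer_init() below) to pick a chain head in
 * hash_table[].
 */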
static __inline__ void __hash_link(struct buffer_head
*bh
,struct buffer_head
**head
) 417 if((bh
->b_next
= *head
) != NULL
) 418 bh
->b_next
->b_pprev
= &bh
->b_next
; 423 static __inline__
void__hash_unlink(struct buffer_head
*bh
) 427 bh
->b_next
->b_pprev
= bh
->b_pprev
; 428 *(bh
->b_pprev
) = bh
->b_next
; 433 static void__insert_into_lru_list(struct buffer_head
* bh
,int blist
) 435 struct buffer_head
**bhp
= &lru_list
[blist
]; 439 bh
->b_prev_free
= bh
; 441 bh
->b_next_free
= *bhp
; 442 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 443 (*bhp
)->b_prev_free
->b_next_free
= bh
; 444 (*bhp
)->b_prev_free
= bh
; 445 nr_buffers_type
[blist
]++; 446 size_buffers_type
[blist
] += bh
->b_size
; 449 static void__remove_from_lru_list(struct buffer_head
* bh
,int blist
) 451 if(bh
->b_prev_free
|| bh
->b_next_free
) { 452 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 453 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 454 if(lru_list
[blist
] == bh
) 455 lru_list
[blist
] = bh
->b_next_free
; 456 if(lru_list
[blist
] == bh
) 457 lru_list
[blist
] = NULL
; 458 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 459 nr_buffers_type
[blist
]--; 460 size_buffers_type
[blist
] -= bh
->b_size
; 464 static void__remove_from_free_list(struct buffer_head
* bh
,int index
) 466 if(bh
->b_next_free
== bh
) 467 free_list
[index
].list
= NULL
; 469 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 470 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 471 if(free_list
[index
].list
== bh
) 472 free_list
[index
].list
= bh
->b_next_free
; 474 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 477 /* must be called with both the hash_table_lock and the lru_list_lock 479 static void__remove_from_queues(struct buffer_head
*bh
) 482 __remove_from_lru_list(bh
, bh
->b_list
); 485 static void__insert_into_queues(struct buffer_head
*bh
) 487 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
); 489 __hash_link(bh
, head
); 490 __insert_into_lru_list(bh
, bh
->b_list
); 493 /* This function must only run if there are no other 494 * references _anywhere_ to this buffer head. 496 static voidput_last_free(struct buffer_head
* bh
) 498 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)]; 499 struct buffer_head
**bhp
= &head
->list
; 503 spin_lock(&head
->lock
); 507 bh
->b_prev_free
= bh
; 509 bh
->b_next_free
= *bhp
; 510 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 511 (*bhp
)->b_prev_free
->b_next_free
= bh
; 512 (*bhp
)->b_prev_free
= bh
; 513 spin_unlock(&head
->lock
); 517 * Why like this, I hear you say... The reason is race-conditions. 518 * As we don't lock buffers (unless we are reading them, that is), 519 * something might happen to it while we sleep (ie a read-error 520 * will force it bad). This shouldn't really happen currently, but 523 staticinlinestruct buffer_head
*__get_hash_table(kdev_t dev
,int block
,int size
) 525 struct buffer_head
*bh
=hash(dev
, block
); 527 for(; bh
; bh
= bh
->b_next
) 528 if(bh
->b_blocknr
== block
&& 529 bh
->b_size
== size
&& 533 atomic_inc(&bh
->b_count
); 538 struct buffer_head
*get_hash_table(kdev_t dev
,int block
,int size
) 540 struct buffer_head
*bh
; 542 read_lock(&hash_table_lock
); 543 bh
=__get_hash_table(dev
, block
, size
); 544 read_unlock(&hash_table_lock
); 549 unsigned intget_hardblocksize(kdev_t dev
) 552 * Get the hard sector size for the given device. If we don't know 553 * what it is, return 0. 555 if(hardsect_size
[MAJOR(dev
)] != NULL
) { 556 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash dirty
   buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

   NOTE: In the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to an error of the user), by not destroying the dirty buffers
   we could generate corruption also on the next media inserted, thus a
   parameter is necessary to handle this case in the safest way possible
   (trying not to corrupt the newly inserted disk as well with the data
   belonging to the old, now corrupted, disk). Also for the ramdisk the
   natural thing to do in order to release the ramdisk memory is to destroy
   dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and then
   an invalidate_buffers call that doesn't trash dirty buffers. */
void __invalidate_buffers(kdev_t dev
,int destroy_dirty_buffers
) 591 struct buffer_head
* bh
, * bh_next
; 595 spin_lock(&lru_list_lock
); 596 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 597 bh
= lru_list
[nlist
]; 600 for(i
= nr_buffers_type
[nlist
]; i
>0; bh
= bh_next
, i
--) { 601 bh_next
= bh
->b_next_free
; 604 if(buffer_locked(bh
)) { 605 atomic_inc(&bh
->b_count
); 606 spin_unlock(&lru_list_lock
); 609 spin_lock(&lru_list_lock
); 610 atomic_dec(&bh
->b_count
); 613 write_lock(&hash_table_lock
); 614 if(!atomic_read(&bh
->b_count
) && 615 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) { 616 __remove_from_queues(bh
); 619 write_unlock(&hash_table_lock
); 625 spin_unlock(&lru_list_lock
); 630 voidset_blocksize(kdev_t dev
,int size
) 632 externint*blksize_size
[]; 634 struct buffer_head
* bh
, * bh_next
; 636 if(!blksize_size
[MAJOR(dev
)]) 639 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 640 if(size
> PAGE_SIZE
|| size
<512|| (size
& (size
-1))) 641 panic("Invalid blocksize passed to set_blocksize"); 643 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] ==0&& size
== BLOCK_SIZE
) { 644 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 647 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
) 650 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 654 spin_lock(&lru_list_lock
); 655 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 656 bh
= lru_list
[nlist
]; 659 for(i
= nr_buffers_type
[nlist
]; i
>0; bh
= bh_next
, i
--) { 660 bh_next
= bh
->b_next_free
; 661 if(bh
->b_dev
!= dev
|| bh
->b_size
== size
) 663 if(buffer_locked(bh
)) { 664 atomic_inc(&bh
->b_count
); 665 spin_unlock(&lru_list_lock
); 668 spin_lock(&lru_list_lock
); 669 atomic_dec(&bh
->b_count
); 672 write_lock(&hash_table_lock
); 673 if(!atomic_read(&bh
->b_count
)) { 676 "set_blocksize: dev %s buffer_dirty %lu size %hu\n", 677 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
); 678 __remove_from_queues(bh
); 681 if(atomic_set_buffer_clean(bh
)) 683 clear_bit(BH_Uptodate
, &bh
->b_state
); 686 "b_count %d, dev %s, block %lu, from %p\n", 687 atomic_read(&bh
->b_count
),bdevname(bh
->b_dev
), 688 bh
->b_blocknr
,__builtin_return_address(0)); 690 write_unlock(&hash_table_lock
); 696 spin_unlock(&lru_list_lock
); 702 * We used to try various strange things. Let's not. 704 static voidrefill_freelist(int size
) 706 if(!grow_buffers(size
)) { 708 current
->policy
|= SCHED_YIELD
; 713 voidinit_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
,void*private) 715 bh
->b_list
= BUF_CLEAN
; 716 bh
->b_end_io
= handler
; 717 bh
->b_private
=private; 720 static voidend_buffer_io_sync(struct buffer_head
*bh
,int uptodate
) 722 mark_buffer_uptodate(bh
, uptodate
); 726 static voidend_buffer_io_bad(struct buffer_head
*bh
,int uptodate
) 728 mark_buffer_uptodate(bh
, uptodate
); 733 static voidend_buffer_io_async(struct buffer_head
* bh
,int uptodate
) 735 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
; 737 struct buffer_head
*tmp
; 740 mark_buffer_uptodate(bh
, uptodate
); 742 /* This is a temporary buffer used for page I/O. */ 749 * Be _very_ careful from here on. Bad things can happen if 750 * two buffer heads end IO at almost the same time and both 751 * decide that the page is now completely done. 753 * Async buffer_heads are here only as labels for IO, and get 754 * thrown away once the IO for this page is complete. IO is 755 * deemed complete once all buffers have been visited 756 * (b_count==0) and are now unlocked. We must make sure that 757 * only the _last_ buffer that decrements its count is the one 758 * that unlock the page.. 760 spin_lock_irqsave(&page_uptodate_lock
, flags
); 762 atomic_dec(&bh
->b_count
); 763 tmp
= bh
->b_this_page
; 765 if(tmp
->b_end_io
== end_buffer_io_async
&&buffer_locked(tmp
)) 767 tmp
= tmp
->b_this_page
; 770 /* OK, the async IO on this page is complete. */ 771 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 774 * if none of the buffers had errors then we can set the 778 SetPageUptodate(page
); 781 * Run the hooks that have to be done when a page I/O has completed. 783 if(PageTestandClearDecrAfter(page
)) 784 atomic_dec(&nr_async_pages
); 791 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 796 * Ok, this is getblk, and it isn't very clear, again to hinder 797 * race-conditions. Most of the code is seldom used, (ie repeating), 798 * so it should be much more efficient than it looks. 800 * The algorithm is changed: hopefully better, and an elusive bug removed. 802 * 14.02.92: changed it to sync dirty buffers a bit: better performance 803 * when the filesystem starts to get full of dirty blocks (I hope). 805 struct buffer_head
*getblk(kdev_t dev
,int block
,int size
) 807 struct buffer_head
* bh
; 811 spin_lock(&lru_list_lock
); 812 write_lock(&hash_table_lock
); 813 bh
=__get_hash_table(dev
, block
, size
); 817 isize
=BUFSIZE_INDEX(size
); 818 spin_lock(&free_list
[isize
].lock
); 819 bh
= free_list
[isize
].list
; 821 __remove_from_free_list(bh
, isize
); 822 atomic_set(&bh
->b_count
,1); 824 spin_unlock(&free_list
[isize
].lock
); 827 * OK, FINALLY we know that this buffer is the only one of 828 * its kind, we hold a reference (b_count>0), it is unlocked, 832 init_buffer(bh
, end_buffer_io_sync
, NULL
); 834 bh
->b_blocknr
= block
; 835 bh
->b_state
=1<< BH_Mapped
; 837 /* Insert the buffer into the regular lists */ 838 __insert_into_queues(bh
); 840 write_unlock(&hash_table_lock
); 841 spin_unlock(&lru_list_lock
); 847 * If we block while refilling the free list, somebody may 848 * create the buffer first ... search the hashes again. 850 write_unlock(&hash_table_lock
); 851 spin_unlock(&lru_list_lock
); 852 refill_freelist(size
);

/* -1 -> no need to flush
    0 -> async flush
    1 -> sync flush (wait for I/O completion) */
static int balance_dirty_state(kdev_t dev
) 861 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
; 863 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
; 864 tot
=nr_free_buffer_pages(); 865 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
; 868 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
; 869 hard_dirty_limit
= soft_dirty_limit
*2; 871 if(dirty
> soft_dirty_limit
) { 872 if(dirty
> hard_dirty_limit
) 880 * if a new dirty buffer is created we need to balance bdflush. 882 * in the future we might want to make bdflush aware of different 883 * pressures on different devices - thus the (currently unused) 886 voidbalance_dirty(kdev_t dev
) 888 int state
=balance_dirty_state(dev
); 892 wakeup_bdflush(state
); 895 static __inline__
void__mark_dirty(struct buffer_head
*bh
,int flag
) 897 bh
->b_flushtime
= jiffies
+ bdf_prm
.b_un
.age_buffer
; 901 /* atomic version, the user must call balance_dirty() by hand 902 as soon as it become possible to block */ 903 void__mark_buffer_dirty(struct buffer_head
*bh
,int flag
) 905 if(!atomic_set_buffer_dirty(bh
)) 906 __mark_dirty(bh
, flag
); 909 voidmark_buffer_dirty(struct buffer_head
*bh
,int flag
) 911 __mark_buffer_dirty(bh
, flag
); 912 balance_dirty(bh
->b_dev
); 916 * A buffer may need to be moved from one buffer list to another 917 * (e.g. in case it is not shared any more). Handle this. 919 static void__refile_buffer(struct buffer_head
*bh
) 921 int dispose
= BUF_CLEAN
; 922 if(buffer_locked(bh
)) 923 dispose
= BUF_LOCKED
; 926 if(buffer_protected(bh
)) 927 dispose
= BUF_PROTECTED
; 928 if(dispose
!= bh
->b_list
) { 929 __remove_from_lru_list(bh
, bh
->b_list
); 930 bh
->b_list
= dispose
; 931 __insert_into_lru_list(bh
, dispose
); 935 voidrefile_buffer(struct buffer_head
*bh
) 937 spin_lock(&lru_list_lock
); 939 spin_unlock(&lru_list_lock
); 943 * Release a buffer head 945 void__brelse(struct buffer_head
* buf
) 947 if(atomic_read(&buf
->b_count
)) { 948 atomic_dec(&buf
->b_count
); 951 printk("VFS: brelse: Trying to free free buffer\n"); 955 * bforget() is like brelse(), except it puts the buffer on the 956 * free list if it can.. We can NOT free the buffer if: 957 * - there are other users of it 958 * - it is locked and thus can have active IO 960 void__bforget(struct buffer_head
* buf
) 962 /* grab the lru lock here to block bdflush. */ 963 spin_lock(&lru_list_lock
); 964 write_lock(&hash_table_lock
); 965 if(!atomic_dec_and_test(&buf
->b_count
) ||buffer_locked(buf
)) 968 write_unlock(&hash_table_lock
); 969 __remove_from_lru_list(buf
, buf
->b_list
); 970 spin_unlock(&lru_list_lock
); 975 write_unlock(&hash_table_lock
); 976 spin_unlock(&lru_list_lock
); 980 * bread() reads a specified block and returns the buffer that contains 981 * it. It returns NULL if the block was unreadable. 983 struct buffer_head
*bread(kdev_t dev
,int block
,int size
) 985 struct buffer_head
* bh
; 987 bh
=getblk(dev
, block
, size
); 988 if(buffer_uptodate(bh
)) 990 ll_rw_block(READ
,1, &bh
); 992 if(buffer_uptodate(bh
)) 999 * Ok, breada can be used as bread, but additionally to mark other 1000 * blocks for reading as well. End the argument list with a negative 1006 struct buffer_head
*breada(kdev_t dev
,int block
,int bufsize
, 1007 unsigned int pos
,unsigned int filesize
) 1009 struct buffer_head
* bhlist
[NBUF
]; 1010 unsigned int blocks
; 1011 struct buffer_head
* bh
; 1021 bh
=getblk(dev
, block
, bufsize
); 1022 index
=BUFSIZE_INDEX(bh
->b_size
); 1024 if(buffer_uptodate(bh
)) 1026 elsell_rw_block(READ
,1, &bh
); 1028 blocks
= (filesize
- pos
) >> (9+index
); 1030 if(blocks
< (read_ahead
[MAJOR(dev
)] >> index
)) 1031 blocks
= read_ahead
[MAJOR(dev
)] >> index
; 1035 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ 1039 for(i
=1; i
<blocks
; i
++) { 1040 bh
=getblk(dev
,block
+i
,bufsize
); 1041 if(buffer_uptodate(bh
)) { 1045 else bhlist
[j
++] = bh
; 1048 /* Request the read for these buffers, and then release them. */ 1050 ll_rw_block(READA
, (j
-1), bhlist
+1); 1054 /* Wait for this buffer, and then continue on. */ 1057 if(buffer_uptodate(bh
)) 1064 * Note: the caller should wake up the buffer_wait list if needed. 1066 static __inline__
void__put_unused_buffer_head(struct buffer_head
* bh
) 1068 if(nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) { 1069 kmem_cache_free(bh_cachep
, bh
); 1072 init_waitqueue_head(&bh
->b_wait
); 1073 nr_unused_buffer_heads
++; 1074 bh
->b_next_free
= unused_list
; 1075 bh
->b_this_page
= NULL
; 1081 * Reserve NR_RESERVED buffer heads for async IO requests to avoid 1082 * no-buffer-head deadlock. Return NULL on failure; waiting for 1083 * buffer heads is now handled in create_buffers(). 1085 static struct buffer_head
*get_unused_buffer_head(int async
) 1087 struct buffer_head
* bh
; 1089 spin_lock(&unused_list_lock
); 1090 if(nr_unused_buffer_heads
> NR_RESERVED
) { 1092 unused_list
= bh
->b_next_free
; 1093 nr_unused_buffer_heads
--; 1094 spin_unlock(&unused_list_lock
); 1097 spin_unlock(&unused_list_lock
); 1099 /* This is critical. We can't swap out pages to get 1100 * more buffer heads, because the swap-out may need 1101 * more buffer-heads itself. Thus SLAB_BUFFER. 1103 if((bh
=kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) { 1104 memset(bh
,0,sizeof(*bh
)); 1105 init_waitqueue_head(&bh
->b_wait
); 1110 * If we need an async buffer, use the reserved buffer heads. 1113 spin_lock(&unused_list_lock
); 1116 unused_list
= bh
->b_next_free
; 1117 nr_unused_buffer_heads
--; 1118 spin_unlock(&unused_list_lock
); 1121 spin_unlock(&unused_list_lock
); 1125 * (Pending further analysis ...) 1126 * Ordinary (non-async) requests can use a different memory priority 1127 * to free up pages. Any swapping thus generated will use async 1131 (bh
=kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) { 1132 memset(bh
,0,sizeof(*bh
)); 1133 init_waitqueue_head(&bh
->b_wait
); 1141 voidset_bh_page(struct buffer_head
*bh
,struct page
*page
,unsigned long offset
) 1144 if(offset
>= PAGE_SIZE
) 1146 if(PageHighMem(page
)) 1148 * This catches illegal uses and preserves the offset: 1150 bh
->b_data
= (char*)(0+ offset
); 1152 bh
->b_data
=page_address(page
) + offset
; 1156 * Create the appropriate buffers when given a page for data area and 1157 * the size of each buffer.. Use the bh->b_this_page linked list to 1158 * follow the buffers created. Return NULL if unable to create more 1160 * The async flag is used to differentiate async IO (paging, swapping) 1161 * from ordinary buffer allocations, and only async requests are allowed 1162 * to sleep waiting for buffer heads. 1164 static struct buffer_head
*create_buffers(struct page
* page
,unsigned long size
,int async
) 1166 struct buffer_head
*bh
, *head
; 1172 while((offset
-= size
) >=0) { 1173 bh
=get_unused_buffer_head(async
); 1177 bh
->b_dev
= B_FREE
;/* Flag as unused */ 1178 bh
->b_this_page
= head
; 1182 bh
->b_next_free
= NULL
; 1184 atomic_set(&bh
->b_count
,0); 1187 set_bh_page(bh
, page
, offset
); 1189 bh
->b_list
= BUF_CLEAN
; 1190 bh
->b_end_io
= end_buffer_io_bad
; 1194 * In case anything failed, we just free everything we got. 1198 spin_lock(&unused_list_lock
); 1201 head
= head
->b_this_page
; 1202 __put_unused_buffer_head(bh
); 1204 spin_unlock(&unused_list_lock
); 1206 /* Wake up any waiters ... */ 1207 wake_up(&buffer_wait
); 1211 * Return failure for non-async IO requests. Async IO requests 1212 * are not allowed to fail, so we have to wait until buffer heads 1213 * become available. But we don't want tasks sleeping with 1214 * partially complete buffers, so all were released above. 1219 /* We're _really_ low on memory. Now we just 1220 * wait for old buffer heads to become free due to 1221 * finishing IO. Since this is an async request and 1222 * the reserve list is empty, we're sure there are 1223 * async buffer heads in use. 1225 run_task_queue(&tq_disk
); 1228 * Set our state for sleeping, then check again for buffer heads. 1229 * This ensures we won't miss a wake_up from an interrupt. 1231 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
); 1235 static intcreate_page_buffers(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
) 1237 struct buffer_head
*head
, *bh
, *tail
; 1240 if(!PageLocked(page
)) 1243 * Allocate async buffer heads pointing to this page, just for I/O. 1244 * They don't show up in the buffer hash table, but they *are* 1245 * registered in page->buffers. 1247 head
=create_buffers(page
, size
,1); 1253 for(bh
= head
; bh
; bh
= bh
->b_this_page
) { 1257 init_buffer(bh
, end_buffer_io_async
, NULL
); 1259 bh
->b_blocknr
= block
; 1261 set_bit(BH_Mapped
, &bh
->b_state
); 1263 tail
->b_this_page
= head
; 1264 page_cache_get(page
); 1265 page
->buffers
= head
; 1269 static voidunmap_buffer(struct buffer_head
* bh
) 1271 if(buffer_mapped(bh
)) { 1272 mark_buffer_clean(bh
); 1274 clear_bit(BH_Uptodate
, &bh
->b_state
); 1275 clear_bit(BH_Mapped
, &bh
->b_state
); 1276 clear_bit(BH_Req
, &bh
->b_state
); 1277 clear_bit(BH_New
, &bh
->b_state
); 1282 * We don't have to release all buffers here, but 1283 * we have to be sure that no dirty buffer is left 1284 * and no IO is going on (no buffer is locked), because 1285 * we have truncated the file and are going to free the 1288 intblock_flushpage(struct page
*page
,unsigned long offset
) 1290 struct buffer_head
*head
, *bh
, *next
; 1291 unsigned int curr_off
=0; 1293 if(!PageLocked(page
)) 1298 head
= page
->buffers
; 1301 unsigned int next_off
= curr_off
+ bh
->b_size
; 1302 next
= bh
->b_this_page
; 1305 * is this block fully flushed? 1307 if(offset
<= curr_off
) 1309 curr_off
= next_off
; 1314 * subtle. We release buffer-heads only if this is 1315 * the 'final' flushpage. We have invalidated the get_block 1316 * cached value unconditionally, so real IO is not 1319 * If the free doesn't work out, the buffers can be 1320 * left around - they just turn into anonymous buffers 1324 if(!try_to_free_buffers(page
,0)) { 1325 atomic_inc(&buffermem_pages
); 1333 static voidcreate_empty_buffers(struct page
*page
,struct inode
*inode
,unsigned long blocksize
) 1335 struct buffer_head
*bh
, *head
, *tail
; 1337 head
=create_buffers(page
, blocksize
,1); 1343 bh
->b_dev
= inode
->i_dev
; 1345 bh
->b_end_io
= end_buffer_io_bad
; 1347 bh
= bh
->b_this_page
; 1349 tail
->b_this_page
= head
; 1350 page
->buffers
= head
; 1351 page_cache_get(page
);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 */

static void unmap_underlying_metadata(struct buffer_head
* bh
) 1368 struct buffer_head
*old_bh
; 1370 old_bh
=get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
); 1372 mark_buffer_clean(old_bh
); 1373 wait_on_buffer(old_bh
); 1374 clear_bit(BH_Req
, &old_bh
->b_state
); 1375 /* Here we could run brelse or bforget. We use 1376 bforget because it will try to put the buffer 1383 * block_write_full_page() is SMP-safe - currently it's still 1384 * being called with the kernel lock held, but the code is ready. 1386 static int__block_write_full_page(struct inode
*inode
,struct page
*page
, get_block_t
*get_block
) 1388 int err
, i
, need_balance_dirty
=0; 1389 unsigned long block
; 1390 struct buffer_head
*bh
, *head
; 1392 if(!PageLocked(page
)) 1396 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
); 1397 head
= page
->buffers
; 1399 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
); 1405 * If the buffer isn't up-to-date, we can't be sure 1406 * that the buffer has been initialized with the proper 1407 * block number information etc.. 1409 * Leave it to the low-level FS to make all those 1410 * decisions (block #0 may actually be a valid block) 1412 bh
->b_end_io
= end_buffer_io_sync
; 1413 if(!buffer_mapped(bh
)) { 1414 err
=get_block(inode
, block
, bh
,1); 1418 unmap_underlying_metadata(bh
); 1420 set_bit(BH_Uptodate
, &bh
->b_state
); 1421 if(!atomic_set_buffer_dirty(bh
)) { 1423 need_balance_dirty
=1; 1426 bh
= bh
->b_this_page
; 1430 if(need_balance_dirty
) 1431 balance_dirty(bh
->b_dev
); 1433 SetPageUptodate(page
); 1436 ClearPageUptodate(page
); 1440 static int__block_prepare_write(struct inode
*inode
,struct page
*page
, 1441 unsigned from
,unsigned to
, get_block_t
*get_block
) 1443 unsigned block_start
, block_end
; 1444 unsigned long block
; 1446 unsigned blocksize
, bbits
; 1447 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
; 1448 char*kaddr
= (char*)kmap(page
); 1450 blocksize
= inode
->i_sb
->s_blocksize
; 1452 create_empty_buffers(page
, inode
, blocksize
); 1453 head
= page
->buffers
; 1455 bbits
= inode
->i_sb
->s_blocksize_bits
; 1456 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
); 1458 for(bh
= head
, block_start
=0; bh
!= head
|| !block_start
; 1459 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) { 1462 block_end
= block_start
+blocksize
; 1463 if(block_end
<= from
) 1465 if(block_start
>= to
) 1467 bh
->b_end_io
= end_buffer_io_sync
; 1468 if(!buffer_mapped(bh
)) { 1469 err
=get_block(inode
, block
, bh
,1); 1472 if(buffer_new(bh
)) { 1473 unmap_underlying_metadata(bh
); 1475 memset(kaddr
+to
,0, block_end
-to
); 1476 if(block_start
< from
) 1477 memset(kaddr
+block_start
,0, from
-block_start
); 1478 if(block_end
> to
|| block_start
< from
) 1479 flush_dcache_page(page
); 1483 if(!buffer_uptodate(bh
) && 1484 (block_start
< from
|| block_end
> to
)) { 1485 ll_rw_block(READ
,1, &bh
); 1490 * If we issued read requests - let them complete. 1492 while(wait_bh
> wait
) { 1493 wait_on_buffer(*--wait_bh
); 1495 if(!buffer_uptodate(*wait_bh
)) 1503 static int__block_commit_write(struct inode
*inode
,struct page
*page
, 1504 unsigned from
,unsigned to
) 1506 unsigned block_start
, block_end
; 1507 int partial
=0, need_balance_dirty
=0; 1509 struct buffer_head
*bh
, *head
; 1511 blocksize
= inode
->i_sb
->s_blocksize
; 1513 for(bh
= head
= page
->buffers
, block_start
=0; 1514 bh
!= head
|| !block_start
; 1515 block_start
=block_end
, bh
= bh
->b_this_page
) { 1516 block_end
= block_start
+ blocksize
; 1517 if(block_end
<= from
|| block_start
>= to
) { 1518 if(!buffer_uptodate(bh
)) 1521 set_bit(BH_Uptodate
, &bh
->b_state
); 1522 if(!atomic_set_buffer_dirty(bh
)) { 1524 need_balance_dirty
=1; 1529 if(need_balance_dirty
) 1530 balance_dirty(bh
->b_dev
);

	/*
	 * If this is a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page
); 1543 * Generic "read page" function for block devices that have the normal 1544 * get_block functionality. This is most of the block device filesystems. 1545 * Reads the page asynchronously --- the unlock_buffer() and 1546 * mark_buffer_uptodate() functions propagate buffer state into the 1547 * page struct once IO has completed. 1549 intblock_read_full_page(struct page
*page
, get_block_t
*get_block
) 1551 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1552 unsigned long iblock
, lblock
; 1553 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
]; 1554 unsigned int blocksize
, blocks
; 1555 unsigned long kaddr
=0; 1558 if(!PageLocked(page
)) 1560 blocksize
= inode
->i_sb
->s_blocksize
; 1562 create_empty_buffers(page
, inode
, blocksize
); 1563 head
= page
->buffers
; 1565 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
; 1566 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
); 1567 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
; 1573 if(buffer_uptodate(bh
)) 1576 if(!buffer_mapped(bh
)) { 1578 get_block(inode
, iblock
, bh
,0); 1579 if(!buffer_mapped(bh
)) { 1582 memset((char*)(kaddr
+ i
*blocksize
),0, blocksize
); 1583 flush_dcache_page(page
); 1584 set_bit(BH_Uptodate
, &bh
->b_state
); 1589 init_buffer(bh
, end_buffer_io_async
, NULL
); 1590 atomic_inc(&bh
->b_count
); 1593 }while(i
++, iblock
++, (bh
= bh
->b_this_page
) != head
); 1596 if(Page_Uptodate(page
)) 1598 ll_rw_block(READ
, nr
, arr
); 1601 * all buffers are uptodate - we can set the page 1604 SetPageUptodate(page
);

/*
 * For moronic filesystems that do not allow holes in files.
 * We may have to extend the file.
 */

int cont_prepare_write(struct page
*page
,unsigned offset
,unsigned to
, get_block_t
*get_block
,unsigned long*bytes
) 1619 struct address_space
*mapping
= page
->mapping
; 1620 struct inode
*inode
= (struct inode
*)mapping
->host
; 1621 struct page
*new_page
; 1622 unsigned long pgpos
; 1625 unsigned blocksize
= inode
->i_sb
->s_blocksize
; 1628 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) { 1630 new_page
=grab_cache_page(mapping
, pgpos
); 1633 /* we might sleep */ 1634 if(*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) { 1635 UnlockPage(new_page
); 1636 page_cache_release(new_page
); 1639 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
; 1640 if(zerofrom
& (blocksize
-1)) { 1641 *bytes
|= (blocksize
-1); 1644 status
=__block_prepare_write(inode
, new_page
, zerofrom
, 1645 PAGE_CACHE_SIZE
, get_block
); 1648 kaddr
=page_address(new_page
); 1649 memset(kaddr
+zerofrom
,0, PAGE_CACHE_SIZE
-zerofrom
); 1650 flush_dcache_page(new_page
); 1651 __block_commit_write(inode
, new_page
, zerofrom
, PAGE_CACHE_SIZE
); 1653 UnlockPage(new_page
); 1654 page_cache_release(new_page
); 1657 if(page
->index
< pgpos
) { 1658 /* completely inside the area */ 1661 /* page covers the boundary, find the boundary offset */ 1662 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
; 1664 /* if we will expand the thing last block will be filled */ 1665 if(to
> zerofrom
&& (zerofrom
& (blocksize
-1))) { 1666 *bytes
|= (blocksize
-1); 1670 /* starting below the boundary? Nothing to zero out */ 1671 if(offset
<= zerofrom
) 1674 status
=__block_prepare_write(inode
, page
, zerofrom
, to
, get_block
); 1677 kaddr
=page_address(page
); 1678 if(zerofrom
< offset
) { 1679 memset(kaddr
+zerofrom
,0, offset
-zerofrom
); 1680 flush_dcache_page(page
); 1681 __block_commit_write(inode
, page
, zerofrom
, offset
); 1685 ClearPageUptodate(page
); 1690 ClearPageUptodate(new_page
); 1692 UnlockPage(new_page
); 1693 page_cache_release(new_page
); 1698 intblock_prepare_write(struct page
*page
,unsigned from
,unsigned to
, 1699 get_block_t
*get_block
) 1701 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1702 int err
=__block_prepare_write(inode
, page
, from
, to
, get_block
); 1704 ClearPageUptodate(page
); 1710 intgeneric_commit_write(struct file
*file
,struct page
*page
, 1711 unsigned from
,unsigned to
) 1713 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1714 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
; 1715 __block_commit_write(inode
,page
,from
,to
); 1717 if(pos
> inode
->i_size
) { 1718 inode
->i_size
= pos
; 1719 mark_inode_dirty(inode
); 1724 intblock_write_full_page(struct page
*page
, get_block_t
*get_block
) 1726 struct inode
*inode
= (struct inode
*)page
->mapping
->host
; 1727 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
; 1732 if(page
->index
< end_index
) 1733 return__block_write_full_page(inode
, page
, get_block
); 1735 /* things got complicated... */ 1736 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1); 1737 /* OK, are we completely out? */ 1738 if(page
->index
>= end_index
+1|| !offset
) 1740 /* Sigh... will have to work, then... */ 1741 err
=__block_prepare_write(inode
, page
,0, offset
, get_block
); 1743 memset(page_address(page
) + offset
,0, PAGE_CACHE_SIZE
- offset
); 1744 flush_dcache_page(page
); 1745 __block_commit_write(inode
,page
,0,offset
); 1750 ClearPageUptodate(page
); 1754 intgeneric_block_bmap(struct address_space
*mapping
,long block
, get_block_t
*get_block
) 1756 struct buffer_head tmp
; 1757 struct inode
*inode
= (struct inode
*)mapping
->host
; 1760 get_block(inode
, block
, &tmp
,0); 1761 return tmp
.b_blocknr
; 1765 * IO completion routine for a buffer_head being used for kiobuf IO: we 1766 * can't dispatch the kiobuf callback until io_count reaches 0. 1769 static voidend_buffer_io_kiobuf(struct buffer_head
*bh
,int uptodate
) 1771 struct kiobuf
*kiobuf
; 1773 mark_buffer_uptodate(bh
, uptodate
); 1775 kiobuf
= bh
->b_private
; 1777 end_kio_request(kiobuf
, uptodate
); 1782 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait 1783 * for them to complete. Clean up the buffer_heads afterwards. 1786 static intwait_kio(int rw
,int nr
,struct buffer_head
*bh
[],int size
) 1790 struct buffer_head
*tmp
; 1794 spin_lock(&unused_list_lock
); 1796 for(i
= nr
; --i
>=0; ) { 1799 if(buffer_locked(tmp
)) { 1800 spin_unlock(&unused_list_lock
); 1801 wait_on_buffer(tmp
); 1802 spin_lock(&unused_list_lock
); 1805 if(!buffer_uptodate(tmp
)) { 1806 /* We are traversing bh'es in reverse order so 1807 clearing iosize on error calculates the 1808 amount of IO before the first error. */ 1811 __put_unused_buffer_head(tmp
); 1814 spin_unlock(&unused_list_lock
); 1820 * Start I/O on a physical range of kernel memory, defined by a vector 1821 * of kiobuf structs (much like a user-space iovec list). 1823 * The kiobuf must already be locked for IO. IO is submitted 1824 * asynchronously: you need to check page->locked, page->uptodate, and 1825 * maybe wait on page->wait. 1827 * It is up to the caller to make sure that there are enough blocks 1828 * passed in to completely map the iobufs to disk. 1831 intbrw_kiovec(int rw
,int nr
,struct kiobuf
*iovec
[], 1832 kdev_t dev
,unsigned long b
[],int size
) 1842 int sectors
= size
>>9; 1843 unsigned long blocknr
; 1844 struct kiobuf
* iobuf
= NULL
; 1846 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
]; 1852 * First, do some alignment and validity checks 1854 for(i
=0; i
< nr
; i
++) { 1856 if((iobuf
->offset
& (size
-1)) || 1857 (iobuf
->length
& (size
-1))) 1859 if(!iobuf
->nr_pages
) 1860 panic("brw_kiovec: iobuf not initialised"); 1864 * OK to walk down the iovec doing page IO on each page we find. 1866 bufind
= bhind
= transferred
= err
=0; 1867 for(i
=0; i
< nr
; i
++) { 1869 offset
= iobuf
->offset
; 1870 length
= iobuf
->length
; 1873 for(pageind
=0; pageind
< iobuf
->nr_pages
; pageind
++) { 1874 map
= iobuf
->maplist
[pageind
]; 1881 blocknr
= b
[bufind
++]; 1882 tmp
=get_unused_buffer_head(0); 1888 tmp
->b_dev
= B_FREE
; 1890 set_bh_page(tmp
, map
, offset
); 1891 tmp
->b_this_page
= tmp
; 1893 init_buffer(tmp
, end_buffer_io_kiobuf
, iobuf
); 1894 tmp
->b_rdev
= tmp
->b_dev
= dev
; 1895 tmp
->b_blocknr
= blocknr
; 1896 tmp
->b_rsector
= blocknr
*sectors
; 1897 tmp
->b_state
= (1<< BH_Mapped
) | (1<< BH_Lock
) | (1<< BH_Req
); 1900 set_bit(BH_Uptodate
, &tmp
->b_state
); 1901 set_bit(BH_Dirty
, &tmp
->b_state
); 1908 atomic_inc(&iobuf
->io_count
); 1910 generic_make_request(rw
, tmp
); 1912 * Wait for IO if we have got too much 1914 if(bhind
>= KIO_MAX_SECTORS
) { 1915 err
=wait_kio(rw
, bhind
, bh
, size
); 1923 if(offset
>= PAGE_SIZE
) { 1927 }/* End of block loop */ 1928 }/* End of page loop */ 1929 }/* End of iovec loop */ 1931 /* Is there any IO still left to submit? */ 1933 err
=wait_kio(rw
, bhind
, bh
, size
); 1946 /* We got an error allocating the bh'es. Just free the current 1947 buffer_heads and exit. */ 1948 spin_lock(&unused_list_lock
); 1949 for(i
= bhind
; --i
>=0; ) { 1950 __put_unused_buffer_head(bh
[bhind
]); 1952 spin_unlock(&unused_list_lock
); 1957 * Start I/O on a page. 1958 * This function expects the page to be locked and may return 1959 * before I/O is complete. You then have to check page->locked, 1960 * page->uptodate, and maybe wait on page->wait. 1962 * brw_page() is SMP-safe, although it's being called with the 1963 * kernel lock held - but the code is ready. 1965 * FIXME: we need a swapper_inode->get_block function to remove 1966 * some of the bmap kludges and interface ugliness here. 1968 intbrw_page(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
) 1970 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
]; 1971 int nr
, fresh
/* temporary debugging flag */, block
; 1973 if(!PageLocked(page
)) 1974 panic("brw_page: page not locked for I/O"); 1975 // ClearPageError(page); 1977 * We pretty much rely on the page lock for this, because 1978 * create_page_buffers() might sleep. 1981 if(!page
->buffers
) { 1982 create_page_buffers(rw
, page
, dev
, b
, size
); 1988 head
= page
->buffers
; 1994 if(fresh
&& (atomic_read(&bh
->b_count
) !=0)) 1999 if(!buffer_uptodate(bh
)) { 2001 atomic_inc(&bh
->b_count
); 2004 if(!bh
->b_blocknr
) { 2007 bh
->b_blocknr
= block
; 2012 set_bit(BH_Uptodate
, &bh
->b_state
); 2013 set_bit(BH_Dirty
, &bh
->b_state
); 2015 atomic_inc(&bh
->b_count
); 2017 bh
= bh
->b_this_page
; 2019 if((rw
== READ
) && nr
) { 2020 if(Page_Uptodate(page
)) 2022 ll_rw_block(rw
, nr
, arr
); 2024 if(!nr
&& rw
== READ
) { 2025 SetPageUptodate(page
); 2028 if(nr
&& (rw
== WRITE
)) 2029 ll_rw_block(rw
, nr
, arr
); 2034 intblock_symlink(struct inode
*inode
,const char*symname
,int len
) 2036 struct address_space
*mapping
= inode
->i_mapping
; 2037 struct page
*page
=grab_cache_page(mapping
,0); 2043 err
= mapping
->a_ops
->prepare_write(NULL
, page
,0, len
-1); 2046 kaddr
=page_address(page
); 2047 memcpy(kaddr
, symname
, len
-1); 2048 mapping
->a_ops
->commit_write(NULL
, page
,0, len
-1); 2050 * Notice that we are _not_ going to block here - end of page is 2051 * unmapped, so this will only try to map the rest of page, see 2052 * that it is unmapped (typically even will not look into inode - 2053 * ->i_size will be enough for everything) and zero it out. 2054 * OTOH it's obviously correct and should make the page up-to-date. 2056 err
= mapping
->a_ops
->readpage(NULL
, page
); 2058 page_cache_release(page
); 2061 mark_inode_dirty(inode
); 2065 page_cache_release(page
); 2071 * Try to increase the number of buffers available: the size argument 2072 * is used to determine what kind of buffers we want. 2074 static intgrow_buffers(int size
) 2077 struct buffer_head
*bh
, *tmp
; 2078 struct buffer_head
* insert_point
; 2081 if((size
&511) || (size
> PAGE_SIZE
)) { 2082 printk("VFS: grow_buffers: size = %d\n",size
); 2086 page
=alloc_page(GFP_BUFFER
); 2089 bh
=create_buffers(page
, size
,0); 2091 goto no_buffer_head
; 2093 isize
=BUFSIZE_INDEX(size
); 2095 spin_lock(&free_list
[isize
].lock
); 2096 insert_point
= free_list
[isize
].list
; 2100 tmp
->b_next_free
= insert_point
->b_next_free
; 2101 tmp
->b_prev_free
= insert_point
; 2102 insert_point
->b_next_free
->b_prev_free
= tmp
; 2103 insert_point
->b_next_free
= tmp
; 2105 tmp
->b_prev_free
= tmp
; 2106 tmp
->b_next_free
= tmp
; 2109 if(tmp
->b_this_page
) 2110 tmp
= tmp
->b_this_page
; 2114 tmp
->b_this_page
= bh
; 2115 free_list
[isize
].list
= bh
; 2116 spin_unlock(&free_list
[isize
].lock
); 2119 page
->flags
&= ~(1<< PG_referenced
); 2120 lru_cache_add(page
); 2121 atomic_inc(&buffermem_pages
); 2125 page_cache_release(page
); 2131 * Sync all the buffers on one page.. 2133 * If we have old buffers that are locked, we'll 2134 * wait on them, but we won't wait on the new ones 2135 * we're writing out now. 2137 * This all is required so that we can free up memory 2141 * 0 - no wait (this does not get called - see try_to_free_buffers below) 2142 * 1 - start IO for dirty buffers 2143 * 2 - wait for completion of locked buffers 2145 static voidsync_page_buffers(struct buffer_head
*bh
,int wait
) 2147 struct buffer_head
* tmp
= bh
; 2150 struct buffer_head
*p
= tmp
; 2151 tmp
= tmp
->b_this_page
; 2152 if(buffer_locked(p
)) { 2154 __wait_on_buffer(p
); 2155 }else if(buffer_dirty(p
)) 2156 ll_rw_block(WRITE
,1, &p
);

/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 *
 * NOTE: There are quite a number of ways that threads of control can
 *       obtain a reference to a buffer head within a page.  So we must
 *       lock out all of these paths to cleanly toss the page.
 */
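/*
 * Illustrative note: buffer_busy() treats a buffer as busy if it still has
 * references (b_count != 0) or if any of BH_Dirty, BH_Lock or BH_Protected
 * is set - i.e. only clean, unlocked, unprotected buffers with a zero
 * reference count can be torn down by try_to_free_buffers() below.
 */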
int try_to_free_buffers(struct page * page
,int wait
) 2179 struct buffer_head
* tmp
, * bh
= page
->buffers
; 2180 int index
=BUFSIZE_INDEX(bh
->b_size
); 2182 spin_lock(&lru_list_lock
); 2183 write_lock(&hash_table_lock
); 2184 spin_lock(&free_list
[index
].lock
); 2187 struct buffer_head
*p
= tmp
; 2189 tmp
= tmp
->b_this_page
; 2191 goto busy_buffer_page
; 2194 spin_lock(&unused_list_lock
); 2197 struct buffer_head
* p
= tmp
; 2198 tmp
= tmp
->b_this_page
; 2200 /* The buffer can be either on the regular 2201 * queues or on the free list.. 2203 if(p
->b_dev
!= B_FREE
) 2204 __remove_from_queues(p
); 2206 __remove_from_free_list(p
, index
); 2207 __put_unused_buffer_head(p
); 2209 spin_unlock(&unused_list_lock
); 2211 /* Wake up anyone waiting for buffer heads */ 2212 wake_up(&buffer_wait
); 2214 /* And free the page */ 2215 page
->buffers
= NULL
; 2216 page_cache_release(page
); 2217 spin_unlock(&free_list
[index
].lock
); 2218 write_unlock(&hash_table_lock
); 2219 spin_unlock(&lru_list_lock
); 2223 /* Uhhuh, start writeback so that we don't end up with all dirty pages */ 2224 spin_unlock(&free_list
[index
].lock
); 2225 write_unlock(&hash_table_lock
); 2226 spin_unlock(&lru_list_lock
); 2228 sync_page_buffers(bh
, wait
); 2232 /* ================== Debugging =================== */ 2234 voidshow_buffers(void) 2237 struct buffer_head
* bh
; 2238 int found
=0, locked
=0, dirty
=0, used
=0, lastused
=0; 2241 static char*buf_types
[NR_LIST
] = {"CLEAN","LOCKED","DIRTY","PROTECTED", }; 2244 printk("Buffer memory: %6dkB\n", 2245 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10)); 2247 #ifdef CONFIG_SMP/* trylock does nothing on UP and so we could deadlock */ 2248 if(!spin_trylock(&lru_list_lock
)) 2250 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 2251 found
= locked
= dirty
= used
= lastused
=protected=0; 2252 bh
= lru_list
[nlist
]; 2257 if(buffer_locked(bh
)) 2259 if(buffer_protected(bh
)) 2261 if(buffer_dirty(bh
)) 2263 if(atomic_read(&bh
->b_count
)) 2264 used
++, lastused
= found
; 2265 bh
= bh
->b_next_free
; 2266 }while(bh
!= lru_list
[nlist
]); 2268 int tmp
= nr_buffers_type
[nlist
]; 2270 printk("%9s: BUG -> found %d, reported %d\n", 2271 buf_types
[nlist
], found
, tmp
); 2273 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " 2274 "%d locked, %d protected, %d dirty\n", 2275 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10, 2276 used
, lastused
, locked
,protected, dirty
); 2278 spin_unlock(&lru_list_lock
); 2282 /* ===================== Init ======================= */ 2285 * allocate the hash table and init the free list 2286 * Use gfp() for the hash table to decrease TLB misses, use 2287 * SLAB cache for buffer heads. 2289 void __init
buffer_init(unsigned long mempages
) 2292 unsigned int nr_hash
; 2294 /* The buffer cache hash table is less important these days, 2299 mempages
*=sizeof(struct buffer_head
*); 2301 for(order
=0; (1<< order
) < mempages
; order
++) 2304 /* try to allocate something until we get it or we're asking 2305 for something that is really too small */ 2310 nr_hash
= (PAGE_SIZE
<< order
) /sizeof(struct buffer_head
*); 2311 bh_hash_mask
= (nr_hash
-1); 2315 while((tmp
>>=1UL) !=0UL) 2318 hash_table
= (struct buffer_head
**) 2319 __get_free_pages(GFP_ATOMIC
, order
); 2320 }while(hash_table
== NULL
&& --order
>0); 2321 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", 2322 nr_hash
, order
, (PAGE_SIZE
<< order
)); 2325 panic("Failed to allocate buffer hash table\n"); 2327 /* Setup hash chains. */ 2328 for(i
=0; i
< nr_hash
; i
++) 2329 hash_table
[i
] = NULL
; 2331 /* Setup free lists. */ 2332 for(i
=0; i
< NR_SIZES
; i
++) { 2333 free_list
[i
].list
= NULL
; 2334 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
; 2337 /* Setup lru lists. */ 2338 for(i
=0; i
< NR_LIST
; i
++) 2344 /* ====================== bdflush support =================== */ 2346 /* This is a simple kernel daemon, whose job it is to provide a dynamic 2347 * response to dirty buffers. Once this process is activated, we write back 2348 * a limited number of buffers to the disks and then go back to sleep again. 2350 staticDECLARE_WAIT_QUEUE_HEAD(bdflush_done
); 2351 struct task_struct
*bdflush_tsk
=0; 2353 voidwakeup_bdflush(int block
) 2355 DECLARE_WAITQUEUE(wait
, current
); 2357 if(current
== bdflush_tsk
) 2361 wake_up_process(bdflush_tsk
);

		/* kflushd can wake us up before we have a chance to
		   go to sleep, so we must be smart in handling
		   this wakeup event from kflushd to avoid deadlocking in SMP
		   (we are not holding any lock anymore in these two paths). */
		__set_current_state(TASK_UNINTERRUPTIBLE
); 2370 add_wait_queue(&bdflush_done
, &wait
); 2372 wake_up_process(bdflush_tsk
); 2375 remove_wait_queue(&bdflush_done
, &wait
); 2376 __set_current_state(TASK_RUNNING
);

/* This is the _only_ function that deals with flushing async writes.
   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
   as all dirty buffers live _only_ in the DIRTY lru list.
   As we never browse the LOCKED and CLEAN lru lists they are in fact
   completely useless. */
static int flush_dirty_buffers(int check_flushtime
) 2387 struct buffer_head
* bh
, *next
; 2391 spin_lock(&lru_list_lock
); 2392 bh
= lru_list
[BUF_DIRTY
]; 2395 for(i
= nr_buffers_type
[BUF_DIRTY
]; i
-- >0; bh
= next
) { 2396 next
= bh
->b_next_free
; 2398 if(!buffer_dirty(bh
)) { 2399 __refile_buffer(bh
); 2402 if(buffer_locked(bh
)) 2405 if(check_flushtime
) { 2406 /* The dirty lru list is chronologically ordered so 2407 if the current bh is not yet timed out, 2408 then also all the following bhs 2409 will be too young. */ 2410 if(time_before(jiffies
, bh
->b_flushtime
)) 2413 if(++flushed
> bdf_prm
.b_un
.ndirty
) 2417 /* OK, now we are committed to write it out. */ 2418 atomic_inc(&bh
->b_count
); 2419 spin_unlock(&lru_list_lock
); 2420 ll_rw_block(WRITE
,1, &bh
); 2421 atomic_dec(&bh
->b_count
); 2423 if(current
->need_resched
) 2428 spin_unlock(&lru_list_lock
); 2434 * Here we attempt to write back old buffers. We also try to flush inodes 2435 * and supers as well, since this function is essentially "update", and 2436 * otherwise there would be no way of ensuring that these quantities ever 2437 * get written back. Ideally, we would have a timestamp on the inodes 2438 * and superblocks so that we could write back only the old ones as well 2441 static intsync_old_buffers(void) 2448 flush_dirty_buffers(1); 2449 /* must really sync all the active I/O request to disk here */ 2450 run_task_queue(&tq_disk
); 2454 intblock_sync_page(struct page
*page
) 2456 run_task_queue(&tq_disk
); 2460 /* This is the interface to bdflush. As we get more sophisticated, we can 2461 * pass tuning parameters to this "process", to adjust how it behaves. 2462 * We would want to verify each parameter, however, to make sure that it 2465 asmlinkage
longsys_bdflush(int func
,long data
) 2467 if(!capable(CAP_SYS_ADMIN
)) 2471 /* do_exit directly and let kupdate to do its work alone. */ 2473 #if 0/* left here as it's the only example of lazy-mm-stuff used from 2474 a syscall that doesn't care about the current mm context. */ 2476 struct mm_struct
*user_mm
;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm
=start_lazy_tlb(); 2485 error
=sync_old_buffers(); 2486 end_lazy_tlb(user_mm
); 2491 /* Basically func 1 means read param 1, 2 means write param 1, etc */ 2493 int i
= (func
-2) >>1; 2494 if(i
>=0&& i
< N_PARAM
) { 2496 returnput_user(bdf_prm
.data
[i
], (int*)data
); 2498 if(data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) { 2499 bdf_prm
.data
[i
] = data
; 2506 /* Having func 0 used to launch the actual bdflush and then never 2507 * return (unless explicitly killed). We return zero here to 2508 * remain semi-compatible with present update(8) programs. 2514 * This is the actual bdflush daemon itself. It used to be started from 2515 * the syscall above, but now we launch it ourselves internally with 2516 * kernel_thread(...) directly after the first thread in init/main.c 2518 intbdflush(void*sem
) 2520 struct task_struct
*tsk
= current
; 2523 * We have a bare-bones task_struct, and really should fill 2524 * in a few more things so "top" and /proc/2/{exe,root,cwd} 2525 * display semi-sane things. Not real crucial though... 2530 strcpy(tsk
->comm
,"kflushd"); 2533 /* avoid getting signals */ 2534 spin_lock_irq(&tsk
->sigmask_lock
); 2536 sigfillset(&tsk
->blocked
); 2537 recalc_sigpending(tsk
); 2538 spin_unlock_irq(&tsk
->sigmask_lock
); 2540 up((struct semaphore
*)sem
); 2543 CHECK_EMERGENCY_SYNC
2545 flushed
= flush_dirty_buffers(0);

		/* If wakeup_bdflush wakes us up
		   after our bdflush_done wakeup, then
		   we must make sure not to sleep
		   in schedule_timeout, otherwise
		   wakeup_bdflush may wait for our
		   bdflush_done wakeup that would never arrive
		   (as we would be sleeping) and so it would
		   deadlock. */
		__set_current_state(TASK_INTERRUPTIBLE
); 2556 wake_up(&bdflush_done
); 2558 * If there are still a lot of dirty buffers around, 2559 * skip the sleep and flush some more. Otherwise, we 2560 * go to sleep waiting a wakeup. 2562 if(!flushed
||balance_dirty_state(NODEV
) <0) 2564 /* Remember to mark us as running otherwise 2565 the next schedule will block. */ 2566 __set_current_state(TASK_RUNNING
);

/*
 * This is the kernel update daemon. It used to live in userspace
 * but since it needs to run safely we want it to be unkillable by mistake.
 * You don't need to change your userspace configuration since
 * the userspace `update` will do_exit(0) at the first sys_bdflush().
 */
int kupdate(void *sem
) 2578 struct task_struct
* tsk
= current
; 2583 strcpy(tsk
->comm
,"kupdate"); 2585 /* sigstop and sigcont will stop and wakeup kupdate */ 2586 spin_lock_irq(&tsk
->sigmask_lock
); 2587 sigfillset(&tsk
->blocked
); 2588 siginitsetinv(¤t
->blocked
,sigmask(SIGCONT
) |sigmask(SIGSTOP
)); 2589 recalc_sigpending(tsk
); 2590 spin_unlock_irq(&tsk
->sigmask_lock
); 2592 up((struct semaphore
*)sem
); 2595 /* update interval */ 2596 interval
= bdf_prm
.b_un
.interval
; 2598 tsk
->state
= TASK_INTERRUPTIBLE
; 2599 schedule_timeout(interval
); 2602 tsk
->state
= TASK_STOPPED
; 2603 schedule();/* wait for SIGCONT */ 2605 /* check for sigstop */ 2606 if(signal_pending(tsk
)) { 2608 spin_lock_irq(&tsk
->sigmask_lock
); 2609 if(sigismember(&tsk
->pending
.signal
, SIGSTOP
)) { 2610 sigdelset(&tsk
->pending
.signal
, SIGSTOP
); 2613 recalc_sigpending(tsk
); 2614 spin_unlock_irq(&tsk
->sigmask_lock
); 2619 printk("kupdate() activated...\n"); 2625 static int __init
bdflush_init(void) 2627 DECLARE_MUTEX_LOCKED(sem
); 2628 kernel_thread(bdflush
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGNAL
); 2630 kernel_thread(kupdate
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGNAL
); 2635 module_init(bdflush_init
)