4 * Copyright (C) 1991, 1992 Linus Torvalds 8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have 9 * been avoided by NEVER letting an interrupt change a buffer (except for the 10 * data, of course), but instead letting the caller do it. 13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ 15 /* Removed a lot of unnecessary code and simplified things now that 16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating 20 * hash table, use SLAB cache for buffer heads. -DaveM 23 /* Added 32k buffer block sizes - these are required older ARM systems. 27 /* Thread it... -DaveM */ 29 #include <linux/sched.h> 31 #include <linux/malloc.h> 32 #include <linux/locks.h> 33 #include <linux/errno.h> 34 #include <linux/swap.h> 35 #include <linux/swapctl.h> 36 #include <linux/smp_lock.h> 37 #include <linux/vmalloc.h> 38 #include <linux/blkdev.h> 39 #include <linux/sysrq.h> 40 #include <linux/file.h> 41 #include <linux/init.h> 42 #include <linux/quotaops.h> 43 #include <linux/iobuf.h> 45 #include <asm/uaccess.h> 47 #include <asm/bitops.h> 48 #include <asm/mmu_context.h> 51 static char buffersize_index
[65] = 52 {-1,0,1, -1,2, -1, -1, -1,3, -1, -1, -1, -1, -1, -1, -1, 53 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 54 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 55 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, 58 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) 59 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512) 60 #define NR_RESERVED (2*MAX_BUF_PER_PAGE) 61 #define MAX_UNUSED_BUFFERS NR_RESERVED+20/* don't ever have more than this 62 number of unused buffer heads */ 64 /* Anti-deadlock ordering: 65 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock 71 static unsigned int bh_hash_mask
=0; 72 static unsigned int bh_hash_shift
=0; 73 static struct buffer_head
**hash_table
; 74 static rwlock_t hash_table_lock
= RW_LOCK_UNLOCKED
; 76 static struct buffer_head
*lru_list
[NR_LIST
]; 77 static spinlock_t lru_list_lock
= SPIN_LOCK_UNLOCKED
; 78 static int nr_buffers_type
[NR_LIST
] = {0,}; 80 static struct buffer_head
* unused_list
= NULL
; 81 static int nr_unused_buffer_heads
=0; 82 static spinlock_t unused_list_lock
= SPIN_LOCK_UNLOCKED
; 83 staticDECLARE_WAIT_QUEUE_HEAD(buffer_wait
); 86 struct buffer_head
*list
; 89 static struct bh_free_head free_list
[NR_SIZES
]; 91 static kmem_cache_t
*bh_cachep
; 93 static intgrow_buffers(int size
); 95 /* This is used by some architectures to estimate available memory. */ 96 atomic_t buffermem
=ATOMIC_INIT(0); 98 /* Here is the parameter block for the bdflush process. If you add or 99 * remove any of the parameters, make sure to update kernel/sysctl.c. 104 /* The dummy values in this structure are left in there for compatibility 105 * with old programs that play with the /proc entries. 107 union bdflush_param
{ 109 int nfract
;/* Percentage of buffer cache dirty to 111 int ndirty
;/* Maximum number of dirty blocks to write out per 113 int nrefill
;/* Number of clean buffers to try to obtain 114 each time we call refill */ 115 int nref_dirt
;/* Dirty buffer threshold for activating bdflush 116 when trying to refill buffers. */ 117 int dummy1
;/* unused */ 118 int age_buffer
;/* Time for normal buffer to age before we flush it */ 119 int age_super
;/* Time for superblock to age before we flush it */ 120 int dummy2
;/* unused */ 121 int dummy3
;/* unused */ 123 unsigned int data
[N_PARAM
]; 124 } bdf_prm
= {{40,500,64,256,15,30*HZ
,5*HZ
,1884,2}}; 126 /* These are the min and max parameter values that we will allow to be assigned */ 127 int bdflush_min
[N_PARAM
] = {0,10,5,25,0,1*HZ
,1*HZ
,1,1}; 128 int bdflush_max
[N_PARAM
] = {100,50000,20000,20000,1000,6000*HZ
,6000*HZ
,2047,5}; 130 voidwakeup_bdflush(int); 133 * Rewrote the wait-routines to use the "new" wait-queue functionality, 134 * and getting rid of the cli-sti pairs. The wait-queue routines still 135 * need cli-sti, but now it's just a couple of 386 instructions or so. 137 * Note that the real wait_on_buffer() is an inline function that checks 138 * if 'b_wait' is set before calling this, so that the queues aren't set 141 void__wait_on_buffer(struct buffer_head
* bh
) 143 struct task_struct
*tsk
= current
; 144 DECLARE_WAITQUEUE(wait
, tsk
); 146 atomic_inc(&bh
->b_count
); 147 add_wait_queue(&bh
->b_wait
, &wait
); 149 tsk
->state
= TASK_UNINTERRUPTIBLE
; 150 run_task_queue(&tq_disk
); 151 if(buffer_locked(bh
)) { 155 tsk
->state
= TASK_RUNNING
; 156 remove_wait_queue(&bh
->b_wait
, &wait
); 157 atomic_dec(&bh
->b_count
); 160 /* Call sync_buffers with wait!=0 to ensure that the call does not 161 * return until all buffer writes have completed. Sync() may return 162 * before the writes have finished; fsync() may not. 165 /* Godamity-damn. Some buffers (bitmaps for filesystems) 166 * spontaneously dirty themselves without ever brelse being called. 167 * We will ultimately want to put these in a separate list, but for 168 * now we search all of the lists for dirty buffers. 170 static intsync_buffers(kdev_t dev
,int wait
) 172 int i
, retry
, pass
=0, err
=0; 173 struct buffer_head
* bh
, *next
; 175 /* One pass for no-wait, three for wait: 176 * 0) write out all dirty, unlocked buffers; 177 * 1) write out all dirty buffers, waiting if locked; 178 * 2) wait for completion by waiting for all buffers to unlock. 183 /* We search all lists as a failsafe mechanism, not because we expect 184 * there to be dirty buffers on any of the other lists. 187 spin_lock(&lru_list_lock
); 188 bh
= lru_list
[BUF_DIRTY
]; 192 for(i
= nr_buffers_type
[BUF_DIRTY
]*2; i
-- >0; bh
= next
) { 193 next
= bh
->b_next_free
; 195 if(!lru_list
[BUF_DIRTY
]) 197 if(dev
&& bh
->b_dev
!= dev
) 199 if(buffer_locked(bh
)) { 200 /* Buffer is locked; skip it unless wait is 201 * requested AND pass > 0. 207 atomic_inc(&bh
->b_count
); 208 spin_unlock(&lru_list_lock
); 210 atomic_dec(&bh
->b_count
); 214 /* If an unlocked buffer is not uptodate, there has 215 * been an IO error. Skip it. 217 if(wait
&&buffer_req(bh
) && !buffer_locked(bh
) && 218 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) { 223 /* Don't write clean buffers. Don't write ANY buffers 226 if(!buffer_dirty(bh
) || pass
>=2) 229 atomic_inc(&bh
->b_count
); 231 spin_unlock(&lru_list_lock
); 232 ll_rw_block(WRITE
,1, &bh
); 233 atomic_dec(&bh
->b_count
); 239 bh
= lru_list
[BUF_LOCKED
]; 241 spin_unlock(&lru_list_lock
); 244 for(i
= nr_buffers_type
[BUF_LOCKED
]*2; i
-- >0; bh
= next
) { 245 next
= bh
->b_next_free
; 247 if(!lru_list
[BUF_LOCKED
]) 249 if(dev
&& bh
->b_dev
!= dev
) 251 if(buffer_locked(bh
)) { 252 /* Buffer is locked; skip it unless wait is 253 * requested AND pass > 0. 259 atomic_inc(&bh
->b_count
); 260 spin_unlock(&lru_list_lock
); 262 spin_lock(&lru_list_lock
); 263 atomic_dec(&bh
->b_count
); 267 spin_unlock(&lru_list_lock
); 269 /* If we are waiting for the sync to succeed, and if any dirty 270 * blocks were written, then repeat; on the second pass, only 271 * wait for buffers being written (do not pass to write any 272 * more buffers on the second pass). 274 }while(wait
&& retry
&& ++pass
<=2); 278 voidsync_dev(kdev_t dev
) 286 * FIXME(eric) we need to sync the physical devices here. 287 * This is because some (scsi) controllers have huge amounts of 288 * cache onboard (hundreds of Mb), and we need to instruct 289 * them to commit all of the dirty memory to disk, and we should 290 * not return until this has happened. 292 * This would need to get implemented by going through the assorted 293 * layers so that each block major number can be synced, and this 294 * would call down into the upper and mid-layer scsi. 298 intfsync_dev(kdev_t dev
) 308 returnsync_buffers(dev
,1); 311 asmlinkage
intsys_sync(void) 318 * filp may be NULL if called via the msync of a vma. 321 intfile_fsync(struct file
*filp
,struct dentry
*dentry
) 323 struct inode
* inode
= dentry
->d_inode
; 324 struct super_block
* sb
; 327 /* sync the inode to buffers */ 328 write_inode_now(inode
); 330 /* sync the superblock to buffers */ 333 if(sb
->s_op
&& sb
->s_op
->write_super
) 334 sb
->s_op
->write_super(sb
); 336 /* .. finally sync the buffers to disk */ 338 returnsync_buffers(dev
,1); 341 asmlinkage
intsys_fsync(unsigned int fd
) 344 struct dentry
* dentry
; 345 struct inode
* inode
; 354 dentry
= file
->f_dentry
; 358 inode
= dentry
->d_inode
; 363 if(!file
->f_op
|| !file
->f_op
->fsync
) 366 /* We need to protect against concurrent writers.. */ 368 err
= file
->f_op
->fsync(file
, dentry
); 378 asmlinkage
intsys_fdatasync(unsigned int fd
) 381 struct dentry
* dentry
; 382 struct inode
* inode
; 391 dentry
= file
->f_dentry
; 395 inode
= dentry
->d_inode
; 400 if(!file
->f_op
|| !file
->f_op
->fsync
) 403 /* this needs further work, at the moment it is identical to fsync() */ 405 err
= file
->f_op
->fsync(file
, dentry
); 415 voidinvalidate_buffers(kdev_t dev
) 419 spin_lock(&lru_list_lock
); 420 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 421 struct buffer_head
* bh
; 424 bh
= lru_list
[nlist
]; 427 for(i
= nr_buffers_type
[nlist
]*2; --i
>0; bh
= bh
->b_next_free
) { 430 if(buffer_locked(bh
)) { 431 atomic_inc(&bh
->b_count
); 432 spin_unlock(&lru_list_lock
); 434 spin_lock(&lru_list_lock
); 435 atomic_dec(&bh
->b_count
); 438 if(atomic_read(&bh
->b_count
)) 441 clear_bit(BH_Protected
, &bh
->b_state
); 442 clear_bit(BH_Uptodate
, &bh
->b_state
); 443 clear_bit(BH_Dirty
, &bh
->b_state
); 444 clear_bit(BH_Req
, &bh
->b_state
); 447 spin_unlock(&lru_list_lock
); 450 /* After several hours of tedious analysis, the following hash 451 * function won. Do not mess with it... -DaveM 453 #define _hashfn(dev,block) \ 454 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ 455 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12)))) 456 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)] 458 static __inline__
void__hash_link(struct buffer_head
*bh
,struct buffer_head
**head
) 460 if((bh
->b_next
= *head
) != NULL
) 461 bh
->b_next
->b_pprev
= &bh
->b_next
; 466 static __inline__
void__hash_unlink(struct buffer_head
*bh
) 469 bh
->b_next
->b_pprev
= bh
->b_pprev
; 470 *(bh
->b_pprev
) = bh
->b_next
; 474 static void__insert_into_lru_list(struct buffer_head
* bh
,int blist
) 476 struct buffer_head
**bhp
= &lru_list
[blist
]; 480 bh
->b_prev_free
= bh
; 482 bh
->b_next_free
= *bhp
; 483 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 484 (*bhp
)->b_prev_free
->b_next_free
= bh
; 485 (*bhp
)->b_prev_free
= bh
; 486 nr_buffers_type
[blist
]++; 489 static void__remove_from_lru_list(struct buffer_head
* bh
,int blist
) 491 if(bh
->b_prev_free
|| bh
->b_next_free
) { 492 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 493 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 494 if(lru_list
[blist
] == bh
) 495 lru_list
[blist
] = bh
->b_next_free
; 496 if(lru_list
[blist
] == bh
) 497 lru_list
[blist
] = NULL
; 498 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 499 nr_buffers_type
[blist
]--; 503 static void__remove_from_free_list(struct buffer_head
* bh
,int index
) 505 if(bh
->b_next_free
== bh
) 506 free_list
[index
].list
= NULL
; 508 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
; 509 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
; 510 if(free_list
[index
].list
== bh
) 511 free_list
[index
].list
= bh
->b_next_free
; 513 bh
->b_next_free
= bh
->b_prev_free
= NULL
; 516 /* The following two functions must operate atomically 517 * because they control the visibility of a buffer head 518 * to the rest of the kernel. 520 static __inline__
void__remove_from_queues(struct buffer_head
*bh
) 522 write_lock(&hash_table_lock
); 525 __remove_from_lru_list(bh
, bh
->b_list
); 526 write_unlock(&hash_table_lock
); 529 static voidinsert_into_queues(struct buffer_head
*bh
) 531 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
); 533 spin_lock(&lru_list_lock
); 534 write_lock(&hash_table_lock
); 535 __hash_link(bh
, head
); 536 __insert_into_lru_list(bh
, bh
->b_list
); 537 write_unlock(&hash_table_lock
); 538 spin_unlock(&lru_list_lock
); 541 /* This function must only run if there are no other 542 * references _anywhere_ to this buffer head. 544 static voidput_last_free(struct buffer_head
* bh
) 546 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)]; 547 struct buffer_head
**bhp
= &head
->list
; 549 spin_lock(&head
->lock
); 553 bh
->b_prev_free
= bh
; 555 bh
->b_next_free
= *bhp
; 556 bh
->b_prev_free
= (*bhp
)->b_prev_free
; 557 (*bhp
)->b_prev_free
->b_next_free
= bh
; 558 (*bhp
)->b_prev_free
= bh
; 559 spin_unlock(&head
->lock
); 563 * Why like this, I hear you say... The reason is race-conditions. 564 * As we don't lock buffers (unless we are reading them, that is), 565 * something might happen to it while we sleep (ie a read-error 566 * will force it bad). This shouldn't really happen currently, but 569 struct buffer_head
*get_hash_table(kdev_t dev
,int block
,int size
) 571 struct buffer_head
**head
= &hash(dev
, block
); 572 struct buffer_head
*bh
; 574 read_lock(&hash_table_lock
); 575 for(bh
= *head
; bh
; bh
= bh
->b_next
) 576 if(bh
->b_blocknr
== block
&& 577 bh
->b_size
== size
&& 581 atomic_inc(&bh
->b_count
); 582 read_unlock(&hash_table_lock
); 587 unsigned intget_hardblocksize(kdev_t dev
) 590 * Get the hard sector size for the given device. If we don't know 591 * what it is, return 0. 593 if(hardsect_size
[MAJOR(dev
)] != NULL
) { 594 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)]; 600 * We don't know what the hardware sector size for this device is. 601 * Return 0 indicating that we don't know. 606 voidset_blocksize(kdev_t dev
,int size
) 608 externint*blksize_size
[]; 610 struct buffer_head
* bh
, *bhnext
; 612 if(!blksize_size
[MAJOR(dev
)]) 615 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 616 if(size
> PAGE_SIZE
|| size
<512|| (size
& (size
-1))) 617 panic("Invalid blocksize passed to set_blocksize"); 619 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] ==0&& size
== BLOCK_SIZE
) { 620 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 623 if(blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
) 626 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
; 628 /* We need to be quite careful how we do this - we are moving entries 629 * around on the free list, and we can get in a loop if we are not careful. 631 for(nlist
=0; nlist
< NR_LIST
; nlist
++) { 633 spin_lock(&lru_list_lock
); 634 bh
= lru_list
[nlist
]; 635 for(i
= nr_buffers_type
[nlist
]*2; --i
>0; bh
= bhnext
) { 639 bhnext
= bh
->b_next_free
; 642 if(bh
->b_size
== size
) 644 if(buffer_locked(bh
)) { 645 atomic_inc(&bh
->b_count
); 646 spin_unlock(&lru_list_lock
); 648 atomic_dec(&bh
->b_count
); 651 if(bh
->b_dev
== dev
&& bh
->b_size
!= size
) { 652 clear_bit(BH_Dirty
, &bh
->b_state
); 653 clear_bit(BH_Uptodate
, &bh
->b_state
); 654 clear_bit(BH_Req
, &bh
->b_state
); 657 if(atomic_read(&bh
->b_count
) ==0) { 658 __remove_from_queues(bh
); 662 spin_unlock(&lru_list_lock
); 667 * We used to try various strange things. Let's not. 669 static voidrefill_freelist(int size
) 671 if(!grow_buffers(size
)) { 673 current
->policy
|= SCHED_YIELD
; 678 voidinit_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
,void*dev_id
) 680 bh
->b_list
= BUF_CLEAN
; 682 bh
->b_end_io
= handler
; 683 bh
->b_dev_id
= dev_id
; 686 static voidend_buffer_io_sync(struct buffer_head
*bh
,int uptodate
) 688 mark_buffer_uptodate(bh
, uptodate
); 692 static voidend_buffer_io_bad(struct buffer_head
*bh
,int uptodate
) 694 mark_buffer_uptodate(bh
, uptodate
); 699 static voidend_buffer_io_async(struct buffer_head
* bh
,int uptodate
) 701 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
; 703 struct buffer_head
*tmp
; 707 mark_buffer_uptodate(bh
, uptodate
); 709 /* This is a temporary buffer used for page I/O. */ 710 page
= mem_map
+MAP_NR(bh
->b_data
); 716 * Be _very_ careful from here on. Bad things can happen if 717 * two buffer heads end IO at almost the same time and both 718 * decide that the page is now completely done. 720 * Async buffer_heads are here only as labels for IO, and get 721 * thrown away once the IO for this page is complete. IO is 722 * deemed complete once all buffers have been visited 723 * (b_count==0) and are now unlocked. We must make sure that 724 * only the _last_ buffer that decrements its count is the one 725 * that free's the page.. 727 spin_lock_irqsave(&page_uptodate_lock
, flags
); 729 atomic_dec(&bh
->b_count
); 730 tmp
= bh
->b_this_page
; 732 if(atomic_read(&tmp
->b_count
) && 733 (tmp
->b_end_io
== end_buffer_io_async
)) 735 tmp
= tmp
->b_this_page
; 738 /* OK, the async IO on this page is complete. */ 739 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 742 * if none of the buffers had errors then we can set the 746 SetPageUptodate(page
); 749 * Run the hooks that have to be done when a page I/O has completed. 751 * Note - we need to test the flags before we unlock the page, but 752 * we must not actually free the page until after the unlock! 754 if(test_and_clear_bit(PG_decr_after
, &page
->flags
)) 755 atomic_dec(&nr_async_pages
); 757 if(test_and_clear_bit(PG_free_swap_after
, &page
->flags
)) 758 swap_free(page
->offset
); 760 free
=test_and_clear_bit(PG_free_after
, &page
->flags
); 762 if(page
->owner
!= (void*)-1) 764 page
->owner
= current
; 773 spin_unlock_irqrestore(&page_uptodate_lock
, flags
); 779 * Ok, this is getblk, and it isn't very clear, again to hinder 780 * race-conditions. Most of the code is seldom used, (ie repeating), 781 * so it should be much more efficient than it looks. 783 * The algorithm is changed: hopefully better, and an elusive bug removed. 785 * 14.02.92: changed it to sync dirty buffers a bit: better performance 786 * when the filesystem starts to get full of dirty blocks (I hope). 788 struct buffer_head
*getblk(kdev_t dev
,int block
,int size
) 790 struct buffer_head
* bh
; 794 bh
=get_hash_table(dev
, block
, size
); 796 if(!buffer_dirty(bh
)) { 802 isize
=BUFSIZE_INDEX(size
); 803 spin_lock(&free_list
[isize
].lock
); 804 bh
= free_list
[isize
].list
; 806 __remove_from_free_list(bh
, isize
); 807 atomic_set(&bh
->b_count
,1); 809 spin_unlock(&free_list
[isize
].lock
); 813 /* OK, FINALLY we know that this buffer is the only one of its kind, 814 * we hold a reference (b_count>0), it is unlocked, and it is clean. 816 init_buffer(bh
, end_buffer_io_sync
, NULL
); 818 bh
->b_blocknr
= block
; 819 bh
->b_state
=1<< BH_Mapped
; 821 /* Insert the buffer into the regular lists */ 822 insert_into_queues(bh
); 826 * If we block while refilling the free list, somebody may 827 * create the buffer first ... search the hashes again. 830 refill_freelist(size
); 837 * if a new dirty buffer is created we need to balance bdflush. 839 * in the future we might want to make bdflush aware of different 840 * pressures on different devices - thus the (currently unused) 843 int too_many_dirty_buffers
; 845 voidbalance_dirty(kdev_t dev
) 847 int dirty
= nr_buffers_type
[BUF_DIRTY
]; 848 int ndirty
= bdf_prm
.b_un
.ndirty
; 851 if(dirty
>2*ndirty
) { 852 too_many_dirty_buffers
=1; 858 too_many_dirty_buffers
=0; 862 staticinlinevoid__mark_dirty(struct buffer_head
*bh
,int flag
) 864 bh
->b_flushtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
: bdf_prm
.b_un
.age_buffer
); 865 clear_bit(BH_New
, &bh
->b_state
); 869 void__mark_buffer_dirty(struct buffer_head
*bh
,int flag
) 871 __mark_dirty(bh
, flag
); 875 * A buffer may need to be moved from one buffer list to another 876 * (e.g. in case it is not shared any more). Handle this. 878 static __inline__
void__refile_buffer(struct buffer_head
*bh
) 880 int dispose
= BUF_CLEAN
; 881 if(buffer_locked(bh
)) 882 dispose
= BUF_LOCKED
; 885 if(dispose
!= bh
->b_list
) { 886 __remove_from_lru_list(bh
, bh
->b_list
); 887 bh
->b_list
= dispose
; 888 __insert_into_lru_list(bh
, dispose
); 892 voidrefile_buffer(struct buffer_head
*bh
) 894 spin_lock(&lru_list_lock
); 896 spin_unlock(&lru_list_lock
); 900 * Release a buffer head 902 void__brelse(struct buffer_head
* buf
) 906 if(atomic_read(&buf
->b_count
)) { 907 atomic_dec(&buf
->b_count
); 910 printk("VFS: brelse: Trying to free free buffer\n"); 914 * bforget() is like brelse(), except it puts the buffer on the 915 * free list if it can.. We can NOT free the buffer if: 916 * - there are other users of it 917 * - it is locked and thus can have active IO 919 void__bforget(struct buffer_head
* buf
) 921 spin_lock(&lru_list_lock
); 922 write_lock(&hash_table_lock
); 923 if(atomic_read(&buf
->b_count
) !=1||buffer_locked(buf
)) { 925 atomic_dec(&buf
->b_count
); 927 atomic_set(&buf
->b_count
,0); 931 __remove_from_lru_list(buf
, buf
->b_list
); 934 write_unlock(&hash_table_lock
); 935 spin_unlock(&lru_list_lock
); 939 * bread() reads a specified block and returns the buffer that contains 940 * it. It returns NULL if the block was unreadable. 942 struct buffer_head
*bread(kdev_t dev
,int block
,int size
) 944 struct buffer_head
* bh
; 946 bh
=getblk(dev
, block
, size
); 947 if(buffer_uptodate(bh
)) 949 ll_rw_block(READ
,1, &bh
); 951 if(buffer_uptodate(bh
)) 958 * Ok, breada can be used as bread, but additionally to mark other 959 * blocks for reading as well. End the argument list with a negative 965 struct buffer_head
*breada(kdev_t dev
,int block
,int bufsize
, 966 unsigned int pos
,unsigned int filesize
) 968 struct buffer_head
* bhlist
[NBUF
]; 970 struct buffer_head
* bh
; 980 bh
=getblk(dev
, block
, bufsize
); 981 index
=BUFSIZE_INDEX(bh
->b_size
); 983 if(buffer_uptodate(bh
)) 985 elsell_rw_block(READ
,1, &bh
); 987 blocks
= (filesize
- pos
) >> (9+index
); 989 if(blocks
< (read_ahead
[MAJOR(dev
)] >> index
)) 990 blocks
= read_ahead
[MAJOR(dev
)] >> index
; 994 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ 998 for(i
=1; i
<blocks
; i
++) { 999 bh
=getblk(dev
,block
+i
,bufsize
); 1000 if(buffer_uptodate(bh
)) { 1004 else bhlist
[j
++] = bh
; 1007 /* Request the read for these buffers, and then release them. */ 1009 ll_rw_block(READA
, (j
-1), bhlist
+1); 1013 /* Wait for this buffer, and then continue on. */ 1016 if(buffer_uptodate(bh
)) 1023 * Note: the caller should wake up the buffer_wait list if needed. 1025 static __inline__
void__put_unused_buffer_head(struct buffer_head
* bh
) 1027 if(nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) { 1028 kmem_cache_free(bh_cachep
, bh
); 1031 init_waitqueue_head(&bh
->b_wait
); 1032 nr_unused_buffer_heads
++; 1033 bh
->b_next_free
= unused_list
; 1034 bh
->b_this_page
= NULL
; 1039 static voidput_unused_buffer_head(struct buffer_head
*bh
) 1041 spin_lock(&unused_list_lock
); 1042 __put_unused_buffer_head(bh
); 1043 spin_unlock(&unused_list_lock
); 1047 * Reserve NR_RESERVED buffer heads for async IO requests to avoid 1048 * no-buffer-head deadlock. Return NULL on failure; waiting for 1049 * buffer heads is now handled in create_buffers(). 1051 static struct buffer_head
*get_unused_buffer_head(int async
) 1053 struct buffer_head
* bh
; 1055 spin_lock(&unused_list_lock
); 1056 if(nr_unused_buffer_heads
> NR_RESERVED
) { 1058 unused_list
= bh
->b_next_free
; 1059 nr_unused_buffer_heads
--; 1060 spin_unlock(&unused_list_lock
); 1063 spin_unlock(&unused_list_lock
); 1065 /* This is critical. We can't swap out pages to get 1066 * more buffer heads, because the swap-out may need 1067 * more buffer-heads itself. Thus SLAB_BUFFER. 1069 if((bh
=kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) { 1070 memset(bh
,0,sizeof(*bh
)); 1071 init_waitqueue_head(&bh
->b_wait
); 1076 * If we need an async buffer, use the reserved buffer heads. 1079 spin_lock(&unused_list_lock
); 1082 unused_list
= bh
->b_next_free
; 1083 nr_unused_buffer_heads
--; 1084 spin_unlock(&unused_list_lock
); 1087 spin_unlock(&unused_list_lock
); 1091 * (Pending further analysis ...) 1092 * Ordinary (non-async) requests can use a different memory priority 1093 * to free up pages. Any swapping thus generated will use async 1097 (bh
=kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) { 1098 memset(bh
,0,sizeof(*bh
)); 1099 init_waitqueue_head(&bh
->b_wait
); 1108 * Create the appropriate buffers when given a page for data area and 1109 * the size of each buffer.. Use the bh->b_this_page linked list to 1110 * follow the buffers created. Return NULL if unable to create more 1112 * The async flag is used to differentiate async IO (paging, swapping) 1113 * from ordinary buffer allocations, and only async requests are allowed 1114 * to sleep waiting for buffer heads. 1116 static struct buffer_head
*create_buffers(unsigned long page
,unsigned long size
,int async
) 1118 DECLARE_WAITQUEUE(wait
, current
); 1119 struct buffer_head
*bh
, *head
; 1125 while((offset
-= size
) >=0) { 1126 bh
=get_unused_buffer_head(async
); 1130 bh
->b_dev
= B_FREE
;/* Flag as unused */ 1131 bh
->b_this_page
= head
; 1135 bh
->b_next_free
= NULL
; 1137 atomic_set(&bh
->b_count
,0); 1140 bh
->b_data
= (char*) (page
+offset
); 1141 bh
->b_list
= BUF_CLEAN
; 1143 bh
->b_end_io
= end_buffer_io_bad
; 1147 * In case anything failed, we just free everything we got. 1151 spin_lock(&unused_list_lock
); 1154 head
= head
->b_this_page
; 1155 __put_unused_buffer_head(bh
); 1157 spin_unlock(&unused_list_lock
); 1159 /* Wake up any waiters ... */ 1160 wake_up(&buffer_wait
); 1164 * Return failure for non-async IO requests. Async IO requests 1165 * are not allowed to fail, so we have to wait until buffer heads 1166 * become available. But we don't want tasks sleeping with 1167 * partially complete buffers, so all were released above. 1172 /* We're _really_ low on memory. Now we just 1173 * wait for old buffer heads to become free due to 1174 * finishing IO. Since this is an async request and 1175 * the reserve list is empty, we're sure there are 1176 * async buffer heads in use. 1178 run_task_queue(&tq_disk
); 1181 * Set our state for sleeping, then check again for buffer heads. 1182 * This ensures we won't miss a wake_up from an interrupt. 1184 add_wait_queue(&buffer_wait
, &wait
); 1185 current
->state
= TASK_UNINTERRUPTIBLE
; 1186 if(nr_unused_buffer_heads
< MAX_BUF_PER_PAGE
) { 1187 current
->policy
|= SCHED_YIELD
; 1190 remove_wait_queue(&buffer_wait
, &wait
); 1191 current
->state
= TASK_RUNNING
; 1195 static intcreate_page_buffers(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
,int bmap
) 1197 struct buffer_head
*head
, *bh
, *tail
; 1200 if(!PageLocked(page
)) 1202 if(page
->owner
!= current
) 1205 * Allocate async buffer heads pointing to this page, just for I/O. 1206 * They don't show up in the buffer hash table, but they *are* 1207 * registered in page->buffers. 1209 head
=create_buffers(page_address(page
), size
,1); 1215 for(bh
= head
; bh
; bh
= bh
->b_this_page
) { 1219 init_buffer(bh
, end_buffer_io_async
, NULL
); 1221 bh
->b_blocknr
= block
; 1224 * When we use bmap, we define block zero to represent 1225 * a hole. ll_rw_page, however, may legitimately 1226 * access block zero, and we need to distinguish the 1229 if(bmap
&& !block
) { 1230 memset(bh
->b_data
,0, size
); 1231 set_bit(BH_Uptodate
, &bh
->b_state
); 1234 set_bit(BH_Mapped
, &bh
->b_state
); 1236 tail
->b_this_page
= head
; 1238 page
->buffers
= head
; 1243 * We don't have to release all buffers here, but 1244 * we have to be sure that no dirty buffer is left 1245 * and no IO is going on (no buffer is locked), because 1246 * we have truncated the file and are going to free the 1249 intblock_flushpage(struct inode
*inode
,struct page
*page
,unsigned long offset
) 1251 struct buffer_head
*head
, *bh
, *next
; 1252 unsigned int curr_off
=0; 1254 if(!PageLocked(page
)) 1259 head
= page
->buffers
; 1262 unsigned int next_off
= curr_off
+ bh
->b_size
; 1263 next
= bh
->b_this_page
; 1266 * is this block fully flushed? 1268 if(offset
<= curr_off
) { 1269 if(buffer_mapped(bh
)) { 1270 atomic_inc(&bh
->b_count
); 1272 if(bh
->b_dev
== B_FREE
) 1274 mark_buffer_clean(bh
); 1275 clear_bit(BH_Uptodate
, &bh
->b_state
); 1276 clear_bit(BH_Mapped
, &bh
->b_state
); 1277 clear_bit(BH_Req
, &bh
->b_state
); 1279 atomic_dec(&bh
->b_count
); 1282 curr_off
= next_off
; 1287 * subtle. We release buffer-heads only if this is 1288 * the 'final' flushpage. We have invalidated the bmap 1289 * cached value unconditionally, so real IO is not 1292 * If the free doesn't work out, the buffers can be 1293 * left around - they just turn into anonymous buffers 1297 if(!try_to_free_buffers(page
)) 1298 atomic_add(PAGE_CACHE_SIZE
, &buffermem
); 1304 static voidcreate_empty_buffers(struct page
*page
,struct inode
*inode
,unsigned long blocksize
) 1306 struct buffer_head
*bh
, *head
, *tail
; 1308 head
=create_buffers(page_address(page
), blocksize
,1); 1314 bh
->b_dev
= inode
->i_dev
; 1316 bh
->b_end_io
= end_buffer_io_bad
; 1318 bh
= bh
->b_this_page
; 1320 tail
->b_this_page
= head
; 1321 page
->buffers
= head
; 1326 * block_write_full_page() is SMP-safe - currently it's still 1327 * being called with the kernel lock held, but the code is ready. 1329 intblock_write_full_page(struct file
*file
,struct page
*page
) 1331 struct dentry
*dentry
= file
->f_dentry
; 1332 struct inode
*inode
= dentry
->d_inode
; 1334 unsigned long block
, offset
; 1335 struct buffer_head
*bh
, *head
; 1337 if(!PageLocked(page
)) 1341 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
); 1342 head
= page
->buffers
; 1344 offset
= page
->offset
; 1345 block
= offset
>> inode
->i_sb
->s_blocksize_bits
; 1347 // FIXME: currently we assume page alignment. 1348 if(offset
& (PAGE_SIZE
-1)) 1358 * If the buffer isn't up-to-date, we can't be sure 1359 * that the buffer has been initialized with the proper 1360 * block number information etc.. 1362 * Leave it to the low-level FS to make all those 1363 * decisions (block #0 may actually be a valid block) 1365 bh
->b_end_io
= end_buffer_io_sync
; 1366 if(!buffer_mapped(bh
)) { 1367 err
= inode
->i_op
->get_block(inode
, block
, bh
,1); 1371 set_bit(BH_Uptodate
, &bh
->b_state
); 1372 mark_buffer_dirty(bh
,0); 1374 bh
= bh
->b_this_page
; 1378 SetPageUptodate(page
); 1381 ClearPageUptodate(page
); 1385 intblock_write_partial_page(struct file
*file
,struct page
*page
,unsigned long offset
,unsigned long bytes
,const char* buf
) 1387 struct dentry
*dentry
= file
->f_dentry
; 1388 struct inode
*inode
= dentry
->d_inode
; 1389 unsigned long block
; 1391 unsigned long blocksize
, start_block
, end_block
; 1392 unsigned long start_offset
, start_bytes
, end_bytes
; 1393 unsigned long bbits
, blocks
, i
, len
; 1394 struct buffer_head
*bh
, *head
; 1397 target_buf
= (char*)page_address(page
) + offset
; 1399 if(!PageLocked(page
)) 1402 blocksize
= inode
->i_sb
->s_blocksize
; 1404 create_empty_buffers(page
, inode
, blocksize
); 1405 head
= page
->buffers
; 1407 bbits
= inode
->i_sb
->s_blocksize_bits
; 1408 block
= page
->offset
>> bbits
; 1409 blocks
= PAGE_SIZE
>> bbits
; 1410 start_block
= offset
>> bbits
; 1411 end_block
= (offset
+ bytes
-1) >> bbits
; 1412 start_offset
= offset
& (blocksize
-1); 1413 start_bytes
= blocksize
- start_offset
; 1414 if(start_bytes
> bytes
) 1415 start_bytes
= bytes
; 1416 end_bytes
= (offset
+bytes
) & (blocksize
-1); 1417 if(end_bytes
> bytes
) 1420 if(offset
<0|| offset
>= PAGE_SIZE
) 1422 if(bytes
+offset
<0|| bytes
+offset
> PAGE_SIZE
) 1424 if(start_block
<0|| start_block
>= blocks
) 1426 if(end_block
<0|| end_block
>= blocks
) 1428 // FIXME: currently we assume page alignment. 1429 if(page
->offset
& (PAGE_SIZE
-1)) 1439 if((i
< start_block
) || (i
> end_block
)) { 1440 if(!buffer_uptodate(bh
)) 1446 * If the buffer is not up-to-date, we need to ask the low-level 1447 * FS to do something for us (we used to have assumptions about 1448 * the meaning of b_blocknr etc, that's bad). 1450 * If "update" is set, that means that the low-level FS should 1451 * try to make sure that the block is up-to-date because we're 1452 * not going to fill it completely. 1454 bh
->b_end_io
= end_buffer_io_sync
; 1455 if(!buffer_mapped(bh
)) { 1456 err
= inode
->i_op
->get_block(inode
, block
, bh
,1); 1461 if(!buffer_uptodate(bh
) && (start_offset
|| (end_bytes
&& (i
== end_block
)))) { 1462 if(buffer_new(bh
)) { 1463 memset(bh
->b_data
,0, bh
->b_size
); 1465 ll_rw_block(READ
,1, &bh
); 1468 if(!buffer_uptodate(bh
)) 1477 }else if(end_bytes
&& (i
== end_block
)) { 1481 err
=copy_from_user(target_buf
, buf
, len
); 1486 * we dirty buffers only after copying the data into 1487 * the page - this way we can dirty the buffer even if 1488 * the bh is still doing IO. 1490 * NOTE! This also does a direct dirty balance check, 1491 * rather than relying on bdflush just waking up every 1492 * once in a while. This is to catch (and slow down) 1493 * the processes that write tons of buffer.. 1495 * Note how we do NOT want to do this in the full block 1496 * case: full pages are flushed not by the people who 1497 * dirtied them, but by people who need memory. And we 1498 * should not penalize them for somebody else writing 1499 * lots of dirty pages. 1501 set_bit(BH_Uptodate
, &bh
->b_state
); 1502 if(!test_and_set_bit(BH_Dirty
, &bh
->b_state
)) { 1504 if(too_many_dirty_buffers
) 1505 balance_dirty(bh
->b_dev
); 1516 bh
= bh
->b_this_page
; 1520 * is this a partial write that happened to make all buffers 1521 * uptodate then we can optimize away a bogus readpage() for 1522 * the next read(). Here we 'discover' whether the page went 1523 * uptodate as a result of this (potentially partial) write. 1526 SetPageUptodate(page
); 1529 ClearPageUptodate(page
); 1535 * IO completion routine for a buffer_head being used for kiobuf IO: we 1536 * can't dispatch the kiobuf callback until io_count reaches 0. 1539 static voidend_buffer_io_kiobuf(struct buffer_head
*bh
,int uptodate
) 1541 struct kiobuf
*kiobuf
; 1543 mark_buffer_uptodate(bh
, uptodate
); 1545 kiobuf
= bh
->b_kiobuf
; 1546 if(atomic_dec_and_test(&kiobuf
->io_count
)) 1547 kiobuf
->end_io(kiobuf
); 1549 kiobuf
->errno
= -EIO
; 1554 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait 1555 * for them to complete. Clean up the buffer_heads afterwards. 1558 #define dprintk(x...) 1560 static intdo_kio(struct kiobuf
*kiobuf
, 1561 int rw
,int nr
,struct buffer_head
*bh
[],int size
) 1565 struct buffer_head
*tmp
; 1567 struct task_struct
*tsk
= current
; 1568 DECLARE_WAITQUEUE(wait
, tsk
); 1570 dprintk("do_kio start %d\n", rw
); 1574 atomic_add(nr
, &kiobuf
->io_count
); 1576 ll_rw_block(rw
, nr
, bh
); 1578 kiobuf_wait_for_io(kiobuf
); 1580 spin_lock(&unused_list_lock
); 1583 for(i
= nr
; --i
>=0; ) { 1586 if(!buffer_uptodate(tmp
)) { 1587 /* We are traversing bh'es in reverse order so 1588 clearing iosize on error calculates the 1589 amount of IO before the first error. */ 1592 __put_unused_buffer_head(tmp
); 1595 spin_unlock(&unused_list_lock
); 1597 dprintk("do_kio end %d %d\n", iosize
, err
); 1602 return kiobuf
->errno
; 1607 * Start I/O on a physical range of kernel memory, defined by a vector 1608 * of kiobuf structs (much like a user-space iovec list). 1610 * The kiobuf must already be locked for IO. IO is submitted 1611 * asynchronously: you need to check page->locked, page->uptodate, and 1612 * maybe wait on page->wait. 1614 * It is up to the caller to make sure that there are enough blocks 1615 * passed in to completely map the iobufs to disk. 1618 intbrw_kiovec(int rw
,int nr
,struct kiobuf
*iovec
[], 1619 kdev_t dev
,unsigned long b
[],int size
,int bmap
) 1629 unsigned long blocknr
; 1630 struct kiobuf
* iobuf
= NULL
; 1633 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
]; 1639 * First, do some alignment and validity checks 1641 for(i
=0; i
< nr
; i
++) { 1643 if((iobuf
->offset
& (size
-1)) || 1644 (iobuf
->length
& (size
-1))) 1647 panic("brw_kiovec: iobuf not locked for I/O"); 1648 if(!iobuf
->nr_pages
) 1649 panic("brw_kiovec: iobuf not initialised"); 1654 return iobuf
->length
; 1656 dprintk("brw_kiovec: start\n"); 1659 * OK to walk down the iovec doing page IO on each page we find. 1661 bufind
= bhind
= transferred
= err
=0; 1662 for(i
=0; i
< nr
; i
++) { 1664 offset
= iobuf
->offset
; 1665 length
= iobuf
->length
; 1666 dprintk("iobuf %d %d %d\n", offset
, length
, size
); 1668 for(pageind
=0; pageind
< iobuf
->nr_pages
; pageind
++) { 1669 page
= iobuf
->pagelist
[pageind
]; 1670 map
= iobuf
->maplist
[pageind
]; 1673 blocknr
= b
[bufind
++]; 1674 tmp
=get_unused_buffer_head(0); 1680 tmp
->b_dev
= B_FREE
; 1682 tmp
->b_data
= (char*) (page
+ offset
); 1683 tmp
->b_this_page
= tmp
; 1685 init_buffer(tmp
, end_buffer_io_kiobuf
, NULL
); 1687 tmp
->b_blocknr
= blocknr
; 1688 tmp
->b_state
=1<< BH_Mapped
; 1689 tmp
->b_kiobuf
= iobuf
; 1692 set_bit(BH_Uptodate
, &tmp
->b_state
); 1693 set_bit(BH_Dirty
, &tmp
->b_state
); 1696 dprintk("buffer %d (%d) at %p\n", 1697 bhind
, tmp
->b_blocknr
, tmp
->b_data
); 1703 * Start the IO if we have got too much 1705 if(bhind
>= KIO_MAX_SECTORS
) { 1706 err
=do_kio(iobuf
, rw
, bhind
, bh
, size
); 1714 if(offset
>= PAGE_SIZE
) { 1718 }/* End of block loop */ 1719 }/* End of page loop */ 1720 }/* End of iovec loop */ 1722 /* Is there any IO still left to submit? */ 1724 err
=do_kio(iobuf
, rw
, bhind
, bh
, size
); 1732 dprintk("brw_kiovec: end (%d, %d)\n", transferred
, err
); 1738 /* We got an error allocating the bh'es. Just free the current 1739 buffer_heads and exit. */ 1740 spin_lock(&unused_list_lock
); 1741 for(i
= bhind
; --i
>=0; ) { 1742 __put_unused_buffer_head(bh
[bhind
]); 1744 spin_unlock(&unused_list_lock
); 1749 * Start I/O on a page. 1750 * This function expects the page to be locked and may return 1751 * before I/O is complete. You then have to check page->locked, 1752 * page->uptodate, and maybe wait on page->wait. 1754 * brw_page() is SMP-safe, although it's being called with the 1755 * kernel lock held - but the code is ready. 1757 * FIXME: we need a swapper_inode->get_block function to remove 1758 * some of the bmap kludges and interface ugliness here. 1760 intbrw_page(int rw
,struct page
*page
, kdev_t dev
,int b
[],int size
,int bmap
) 1762 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
]; 1763 int nr
, fresh
/* temporary debugging flag */, block
; 1765 if(!PageLocked(page
)) 1766 panic("brw_page: page not locked for I/O"); 1767 // clear_bit(PG_error, &page->flags); 1769 * We pretty much rely on the page lock for this, because 1770 * create_page_buffers() might sleep. 1773 if(!page
->buffers
) { 1774 create_page_buffers(rw
, page
, dev
, b
, size
, bmap
); 1779 page
->owner
= (void*)-1; 1781 head
= page
->buffers
; 1787 if(fresh
&& (atomic_read(&bh
->b_count
) !=0)) 1792 if(bmap
&& !block
) { 1798 if(!buffer_uptodate(bh
)) { 1800 atomic_inc(&bh
->b_count
); 1804 if(!bh
->b_blocknr
) { 1807 bh
->b_blocknr
= block
; 1812 set_bit(BH_Uptodate
, &bh
->b_state
); 1813 set_bit(BH_Dirty
, &bh
->b_state
); 1815 atomic_inc(&bh
->b_count
); 1817 bh
= bh
->b_this_page
; 1821 if((rw
== READ
) && nr
) { 1822 if(Page_Uptodate(page
)) 1824 ll_rw_block(rw
, nr
, arr
); 1826 if(!nr
&& rw
== READ
) { 1827 SetPageUptodate(page
); 1828 page
->owner
= current
; 1831 if(nr
&& (rw
== WRITE
)) 1832 ll_rw_block(rw
, nr
, arr
); 1838 * Generic "read page" function for block devices that have the normal 1839 * bmap functionality. This is most of the block device filesystems. 1840 * Reads the page asynchronously --- the unlock_buffer() and 1841 * mark_buffer_uptodate() functions propagate buffer state into the 1842 * page struct once IO has completed. 1844 intblock_read_full_page(struct file
* file
,struct page
* page
) 1846 struct dentry
*dentry
= file
->f_dentry
; 1847 struct inode
*inode
= dentry
->d_inode
; 1848 unsigned long iblock
; 1849 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
]; 1850 unsigned int blocksize
, blocks
; 1853 if(!PageLocked(page
)) 1855 blocksize
= inode
->i_sb
->s_blocksize
; 1857 create_empty_buffers(page
, inode
, blocksize
); 1858 head
= page
->buffers
; 1860 blocks
= PAGE_SIZE
>> inode
->i_sb
->s_blocksize_bits
; 1861 iblock
= page
->offset
>> inode
->i_sb
->s_blocksize_bits
; 1862 page
->owner
= (void*)-1; 1867 if(buffer_uptodate(bh
)) 1870 if(!buffer_mapped(bh
)) { 1871 inode
->i_op
->get_block(inode
, iblock
, bh
,0); 1872 if(!buffer_mapped(bh
)) { 1873 memset(bh
->b_data
,0, blocksize
); 1874 set_bit(BH_Uptodate
, &bh
->b_state
); 1879 init_buffer(bh
, end_buffer_io_async
, NULL
); 1880 atomic_inc(&bh
->b_count
); 1883 }while(iblock
++, (bh
= bh
->b_this_page
) != head
); 1887 if(Page_Uptodate(page
)) 1889 ll_rw_block(READ
, nr
, arr
); 1892 * all buffers are uptodate - we can set the page 1895 SetPageUptodate(page
); 1896 page
->owner
= current
; 1903 * Try to increase the number of buffers available: the size argument 1904 * is used to determine what kind of buffers we want. 1906 static intgrow_buffers(int size
) 1909 struct buffer_head
*bh
, *tmp
; 1910 struct buffer_head
* insert_point
; 1913 if((size
&511) || (size
> PAGE_SIZE
)) { 1914 printk("VFS: grow_buffers: size = %d\n",size
); 1918 if(!(page
=__get_free_page(GFP_BUFFER
))) 1920 bh
=create_buffers(page
, size
,0); 1926 isize
=BUFSIZE_INDEX(size
); 1928 spin_lock(&free_list
[isize
].lock
); 1929 insert_point
= free_list
[isize
].list
; 1933 tmp
->b_next_free
= insert_point
->b_next_free
; 1934 tmp
->b_prev_free
= insert_point
; 1935 insert_point
->b_next_free
->b_prev_free
= tmp
; 1936 insert_point
->b_next_free
= tmp
; 1938 tmp
->b_prev_free
= tmp
; 1939 tmp
->b_next_free
= tmp
; 1942 if(tmp
->b_this_page
) 1943 tmp
= tmp
->b_this_page
; 1947 tmp
->b_this_page
= bh
; 1948 free_list
[isize
].list
= bh
; 1949 spin_unlock(&free_list
[isize
].lock
); 1951 mem_map
[MAP_NR(page
)].buffers
= bh
; 1952 atomic_add(PAGE_SIZE
, &buffermem
); 1957 * Can the buffer be thrown out? 1959 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) 1960 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) 1963 * try_to_free_buffers() checks if all the buffers on this particular page 1964 * are unused, and frees the page if so. 1966 * Wake up bdflush() if this fails - if we're running low on memory due 1967 * to dirty buffers, we need to flush them out as quickly as possible. 1969 * NOTE: There are quite a number of ways that threads of control can 1970 * obtain a reference to a buffer head within a page. So we must 1971 * lock out all of these paths to cleanly toss the page. 1973 inttry_to_free_buffers(struct page
* page
) 1975 struct buffer_head
* tmp
, * bh
= page
->buffers
; 1976 int index
=BUFSIZE_INDEX(bh
->b_size
); 1979 spin_lock(&lru_list_lock
); 1980 write_lock(&hash_table_lock
); 1981 spin_lock(&free_list
[index
].lock
); 1984 struct buffer_head
* p
= tmp
; 1986 tmp
= tmp
->b_this_page
; 1988 goto busy_buffer_page
; 1991 spin_lock(&unused_list_lock
); 1994 struct buffer_head
* p
= tmp
; 1995 tmp
= tmp
->b_this_page
; 1997 /* The buffer can be either on the regular 1998 * queues or on the free list.. 2000 if(p
->b_dev
== B_FREE
) { 2001 __remove_from_free_list(p
, index
); 2005 __remove_from_lru_list(p
, p
->b_list
); 2007 __put_unused_buffer_head(p
); 2009 spin_unlock(&unused_list_lock
); 2011 /* Wake up anyone waiting for buffer heads */ 2012 wake_up(&buffer_wait
); 2014 /* And free the page */ 2015 page
->buffers
= NULL
; 2019 spin_unlock(&free_list
[index
].lock
); 2020 write_unlock(&hash_table_lock
); 2021 spin_unlock(&lru_list_lock
); 2025 /* Uhhuh, start writeback so that we don't end up with all dirty pages */ 2026 too_many_dirty_buffers
=1; 2032 /* ===================== Init ======================= */ 2035 * allocate the hash table and init the free list 2036 * Use gfp() for the hash table to decrease TLB misses, use 2037 * SLAB cache for buffer heads. 2039 void __init
buffer_init(unsigned long memory_size
) 2042 unsigned int nr_hash
; 2044 /* The buffer cache hash table is less important these days, 2048 memory_size
*=sizeof(struct buffer_head
*); 2049 for(order
=0; (PAGE_SIZE
<< order
) < memory_size
; order
++) 2052 /* try to allocate something until we get it or we're asking 2053 for something that is really too small */ 2058 nr_hash
= (PAGE_SIZE
<< order
) /sizeof(struct buffer_head
*); 2059 bh_hash_mask
= (nr_hash
-1); 2063 while((tmp
>>=1UL) !=0UL) 2066 hash_table
= (struct buffer_head
**) 2067 __get_free_pages(GFP_ATOMIC
, order
); 2068 }while(hash_table
== NULL
&& --order
>0); 2069 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", 2070 nr_hash
, order
, (1UL<<order
) * PAGE_SIZE
); 2073 panic("Failed to allocate buffer hash table\n"); 2075 /* Setup hash chains. */ 2076 for(i
=0; i
< nr_hash
; i
++) 2077 hash_table
[i
] = NULL
; 2079 /* Setup free lists. */ 2080 for(i
=0; i
< NR_SIZES
; i
++) { 2081 free_list
[i
].list
= NULL
; 2082 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
; 2085 /* Setup lru lists. */ 2086 for(i
=0; i
< NR_LIST
; i
++) 2089 bh_cachep
=kmem_cache_create("buffer_head", 2090 sizeof(struct buffer_head
), 2092 SLAB_HWCACHE_ALIGN
, NULL
, NULL
); 2094 panic("Cannot create buffer head SLAB cache\n"); 2098 /* ====================== bdflush support =================== */ 2100 /* This is a simple kernel daemon, whose job it is to provide a dynamic 2101 * response to dirty buffers. Once this process is activated, we write back 2102 * a limited number of buffers to the disks and then go back to sleep again. 2104 staticDECLARE_WAIT_QUEUE_HEAD(bdflush_wait
); 2105 staticDECLARE_WAIT_QUEUE_HEAD(bdflush_done
); 2106 struct task_struct
*bdflush_tsk
=0; 2108 voidwakeup_bdflush(int wait
) 2110 if(current
== bdflush_tsk
) 2113 run_task_queue(&tq_disk
); 2114 wake_up(&bdflush_wait
); 2116 sleep_on(&bdflush_done
); 2121 * Here we attempt to write back old buffers. We also try to flush inodes 2122 * and supers as well, since this function is essentially "update", and 2123 * otherwise there would be no way of ensuring that these quantities ever 2124 * get written back. Ideally, we would have a timestamp on the inodes 2125 * and superblocks so that we could write back only the old ones as well 2128 static intsync_old_buffers(void) 2137 for(nlist
= BUF_LOCKED
; nlist
<= BUF_DIRTY
; nlist
++) { 2138 struct buffer_head
*bh
; 2140 spin_lock(&lru_list_lock
); 2141 bh
= lru_list
[nlist
]; 2143 struct buffer_head
*next
; 2145 for(i
= nr_buffers_type
[nlist
]; i
-- >0; bh
= next
) { 2146 next
= bh
->b_next_free
; 2148 /* If the buffer is not on the proper list, 2151 if((nlist
== BUF_DIRTY
&& 2152 (!buffer_dirty(bh
) && !buffer_locked(bh
))) || 2153 (nlist
== BUF_LOCKED
&& !buffer_locked(bh
))) { 2154 __refile_buffer(bh
); 2158 if(buffer_locked(bh
) || !buffer_dirty(bh
)) 2161 /* OK, now we are committed to write it out. */ 2163 atomic_inc(&bh
->b_count
); 2164 spin_unlock(&lru_list_lock
); 2165 ll_rw_block(WRITE
,1, &bh
); 2166 atomic_dec(&bh
->b_count
); 2170 spin_unlock(&lru_list_lock
); 2172 run_task_queue(&tq_disk
); 2176 /* This is the interface to bdflush. As we get more sophisticated, we can 2177 * pass tuning parameters to this "process", to adjust how it behaves. 2178 * We would want to verify each parameter, however, to make sure that it 2181 asmlinkage
intsys_bdflush(int func
,long data
) 2183 if(!capable(CAP_SYS_ADMIN
)) 2188 struct mm_struct
*user_mm
; 2191 * bdflush will spend all of its time in kernel-space, 2192 * without touching user-space, so we can switch it into 2193 * 'lazy TLB mode' to reduce the cost of context-switches 2194 * to and from bdflush. 2196 user_mm
=start_lazy_tlb(); 2197 error
=sync_old_buffers(); 2198 end_lazy_tlb(user_mm
); 2202 /* Basically func 1 means read param 1, 2 means write param 1, etc */ 2204 int i
= (func
-2) >>1; 2205 if(i
>=0&& i
< N_PARAM
) { 2207 returnput_user(bdf_prm
.data
[i
], (int*)data
); 2209 if(data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) { 2210 bdf_prm
.data
[i
] = data
; 2217 /* Having func 0 used to launch the actual bdflush and then never 2218 * return (unless explicitly killed). We return zero here to 2219 * remain semi-compatible with present update(8) programs. 2225 * This is the actual bdflush daemon itself. It used to be started from 2226 * the syscall above, but now we launch it ourselves internally with 2227 * kernel_thread(...) directly after the first thread in init/main.c 2229 intbdflush(void* unused
) 2232 * We have a bare-bones task_struct, and really should fill 2233 * in a few more things so "top" and /proc/2/{exe,root,cwd} 2234 * display semi-sane things. Not real crucial though... 2237 current
->session
=1; 2239 sprintf(current
->comm
,"kflushd"); 2240 bdflush_tsk
= current
; 2245 CHECK_EMERGENCY_SYNC
2247 for(nlist
= BUF_LOCKED
; nlist
<= BUF_DIRTY
; nlist
++) { 2248 int nr
, major
, written
=0; 2249 struct buffer_head
*next
; 2252 spin_lock(&lru_list_lock
); 2253 next
= lru_list
[nlist
]; 2254 nr
= nr_buffers_type
[nlist
]; 2256 struct buffer_head
*bh
= next
; 2258 next
= next
->b_next_free
; 2260 /* If the buffer is not on the correct list, 2263 if((nlist
== BUF_DIRTY
&& 2264 (!buffer_dirty(bh
) && !buffer_locked(bh
))) || 2265 (nlist
== BUF_LOCKED
&& !buffer_locked(bh
))) { 2266 __refile_buffer(bh
); 2270 /* If we aren't in panic mode, don't write out too much 2271 * at a time. Also, don't write out buffers we don't 2272 * really have to write out yet.. 2274 if(!too_many_dirty_buffers
) { 2275 if(written
> bdf_prm
.b_un
.ndirty
) 2277 if(time_before(jiffies
, bh
->b_flushtime
)) 2281 if(buffer_locked(bh
) || !buffer_dirty(bh
)) 2284 major
=MAJOR(bh
->b_dev
); 2289 * For the loop major we can try to do asynchronous writes, 2290 * but we have to guarantee that we're making some progress.. 2292 atomic_inc(&bh
->b_count
); 2293 spin_unlock(&lru_list_lock
); 2294 if(major
== LOOP_MAJOR
&& written
>1) { 2295 ll_rw_block(WRITEA
,1, &bh
); 2296 if(buffer_dirty(bh
)) 2299 ll_rw_block(WRITE
,1, &bh
); 2300 atomic_dec(&bh
->b_count
); 2303 spin_unlock(&lru_list_lock
); 2305 run_task_queue(&tq_disk
); 2306 wake_up(&bdflush_done
); 2309 * If there are still a lot of dirty buffers around, 2310 * skip the sleep and flush some more. Otherwise, we 2311 * sleep for a while and mark us as not being in panic 2314 if(!too_many_dirty_buffers
|| nr_buffers_type
[BUF_DIRTY
] < bdf_prm
.b_un
.ndirty
) { 2315 too_many_dirty_buffers
=0; 2316 spin_lock_irq(&current
->sigmask_lock
); 2317 flush_signals(current
); 2318 spin_unlock_irq(&current
->sigmask_lock
); 2319 interruptible_sleep_on_timeout(&bdflush_wait
,5*HZ
); 2324 static int __init
bdflush_init(void) 2326 kernel_thread(bdflush
, NULL
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
); 2330 module_init(bdflush_init
)