* - RMK
*/
+/* Thread it... -DaveM */
+
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
@@ -57,30 +59,40 @@ static char buffersize_index[65] =
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
number of unused buffer heads */
+/* Anti-deadlock ordering:
+ * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
+ */
+
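To make the ordering concrete, here is a minimal sketch (illustration only, not
part of the diff; the function name is invented) of how a routine that needs two
of these locks is expected to nest them:

    static void example_relink_buffer(struct buffer_head *bh)
    {
            spin_lock(&lru_list_lock);      /* outermost lock */
            write_lock(&hash_table_lock);   /* nests inside lru_list_lock */
            /* ... move bh on the hash chain and the LRU ring here ... */
            write_unlock(&hash_table_lock); /* release in reverse order */
            spin_unlock(&lru_list_lock);
    }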
/*
- * Hash table mask..
+ * Hash table gook..
*/
-static unsigned long bh_hash_mask = 0;
+static unsigned int bh_hash_mask = 0;
+static unsigned int bh_hash_shift = 0;
+static struct buffer_head **hash_table;
+static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
-static int grow_buffers(int size);
+static struct buffer_head *lru_list[NR_LIST];
+static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
+static int nr_buffers_type[NR_LIST] = {0,};
-static struct buffer_head ** hash_table;
-static struct buffer_head * lru_list[NR_LIST] = {NULL, };
-static struct buffer_head * free_list[NR_SIZES] = {NULL, };
+static struct buffer_head * unused_list = NULL;
+static int nr_unused_buffer_heads = 0;
+static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
-static kmem_cache_t *bh_cachep;
+struct bh_free_head {
+ struct buffer_head *list;
+ spinlock_t lock;
+};
+static struct bh_free_head free_list[NR_SIZES];
-static struct buffer_head * unused_list = NULL;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
-static int nr_buffers = 0;
-static int nr_buffers_type[NR_LIST] = {0,};
-static int nr_buffer_heads = 0;
-static int nr_unused_buffer_heads = 0;
-static int nr_hashed_buffers = 0;
+static kmem_cache_t *bh_cachep;
+
+static int grow_buffers(int size);
/* This is used by some architectures to estimate available memory. */
-int buffermem = 0;
+atomic_t buffermem = ATOMIC_INIT(0);
/* Here is the parameter block for the bdflush process. If you add or
* remove any of the parameters, make sure to update kernel/sysctl.c.
@@ -130,7 +142,7 @@ void __wait_on_buffer(struct buffer_head * bh)
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
- bh->b_count++;
+ atomic_inc(&bh->b_count);
add_wait_queue(&bh->b_wait, &wait);
repeat:
tsk->state = TASK_UNINTERRUPTIBLE;
@@ -141,7 +153,7 @@ repeat:
}
tsk->state = TASK_RUNNING;
remove_wait_queue(&bh->b_wait, &wait);
- bh->b_count--;
+ atomic_dec(&bh->b_count);
}
/* Call sync_buffers with wait!=0 to ensure that the call does not
@@ -166,17 +178,19 @@ static int sync_buffers(kdev_t dev, int wait)
*/
do {
retry = 0;
-repeat:
+
/* We search all lists as a failsafe mechanism, not because we expect
* there to be dirty buffers on any of the other lists.
*/
+repeat:
+ spin_lock(&lru_list_lock);
bh = lru_list[BUF_DIRTY];
if (!bh)
goto repeat2;
+
for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
- if (bh->b_list != BUF_DIRTY)
- goto repeat;
next = bh->b_next_free;
+
if (!lru_list[BUF_DIRTY])
break;
if (dev && bh->b_dev != dev)
@@ -189,7 +203,10 @@ repeat:
retry = 1;
continue;
}
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
wait_on_buffer (bh);
+ atomic_dec(&bh->b_count);
goto repeat;
}
@@ -208,30 +225,24 @@ repeat:
if (!buffer_dirty(bh) || pass >= 2)
continue;
- /* Don't bother about locked buffers.
- *
- * XXX We checked if it was locked above and there is no
- * XXX way we could have slept in between. -DaveM
- */
- if (buffer_locked(bh))
- continue;
- bh->b_count++;
- next->b_count++;
+ atomic_inc(&bh->b_count);
bh->b_flushtime = 0;
+ spin_unlock(&lru_list_lock);
ll_rw_block(WRITE, 1, &bh);
- bh->b_count--;
- next->b_count--;
+ atomic_dec(&bh->b_count);
retry = 1;
+ goto repeat;
}
repeat2:
bh = lru_list[BUF_LOCKED];
- if (!bh)
+ if (!bh) {
+ spin_unlock(&lru_list_lock);
break;
+ }
for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
- if (bh->b_list != BUF_LOCKED)
- goto repeat2;
next = bh->b_next_free;
+
if (!lru_list[BUF_LOCKED])
break;
if (dev && bh->b_dev != dev)
@@ -244,10 +255,15 @@ repeat:
retry = 1;
continue;
}
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
wait_on_buffer (bh);
+ spin_lock(&lru_list_lock);
+ atomic_dec(&bh->b_count);
goto repeat2;
}
}
+ spin_unlock(&lru_list_lock);
/* If we are waiting for the sync to succeed, and if any dirty
* blocks were written, then repeat; on the second pass, only
@@ -281,17 +297,19 @@ void sync_dev(kdev_t dev)
int fsync_dev(kdev_t dev)
{
sync_buffers(dev, 0);
+
+ lock_kernel();
sync_supers(dev);
sync_inodes(dev);
DQUOT_SYNC(dev);
+ unlock_kernel();
+
return sync_buffers(dev, 1);
}
asmlinkage int sys_sync(void)
{
- lock_kernel();
fsync_dev(0);
- unlock_kernel();
return 0;
}
void invalidate_buffers(kdev_t dev)
{
- int i;
int nlist;
- struct buffer_head * bh;
+ spin_lock(&lru_list_lock);
for(nlist = 0; nlist < NR_LIST; nlist++) {
+ struct buffer_head * bh;
+ int i;
+ retry:
bh = lru_list[nlist];
+ if (!bh)
+ continue;
for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
if (bh->b_dev != dev)
continue;
- wait_on_buffer(bh);
- if (bh->b_dev != dev)
- continue;
- if (bh->b_count)
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ spin_lock(&lru_list_lock);
+ atomic_dec(&bh->b_count);
+ goto retry;
+ }
+ if (atomic_read(&bh->b_count))
continue;
bh->b_flushtime = 0;
clear_bit(BH_Protected, &bh->b_state);
@@ -416,157 +443,119 @@ void invalidate_buffers(kdev_t dev)
clear_bit(BH_Req, &bh->b_state);
}
}
+ spin_unlock(&lru_list_lock);
}
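The pin, drop the lock, sleep, rescan idiom used above (and again in sync_buffers()
and set_blocksize()) reads more clearly in isolation; the sketch below is
illustrative only and scan_list_sketch() is an invented name:

    static void scan_list_sketch(int nlist)
    {
            struct buffer_head *bh;
            int i;
    retry:
            spin_lock(&lru_list_lock);
            bh = lru_list[nlist];
            for (i = nr_buffers_type[nlist]; bh && i-- > 0; bh = bh->b_next_free) {
                    if (buffer_locked(bh)) {
                            atomic_inc(&bh->b_count);       /* keep bh alive across the sleep */
                            spin_unlock(&lru_list_lock);    /* never sleep holding lru_list_lock */
                            wait_on_buffer(bh);
                            atomic_dec(&bh->b_count);
                            goto retry;                     /* the list may have changed */
                    }
                    /* ... examine bh while lru_list_lock is held ... */
            }
            spin_unlock(&lru_list_lock);
    }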
-#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
-#define hash(dev,block) hash_table[_hashfn(dev,block)]
+/* After several hours of tedious analysis, the following hash
+ * function won. Do not mess with it... -DaveM
+ */
+#define _hashfn(dev,block) \
+ ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
+ (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
+#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
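For readability, the macro pair can be read as the equivalent helper below (a
sketch only; bh_hash_bucket() is an invented name, and it assumes bh_hash_shift
is at least 12, as arranged in buffer_init()):

    static inline struct buffer_head **bh_hash_bucket(kdev_t dev, int block)
    {
            unsigned int bucket;

            /* Mix device and block bits so that consecutive blocks of one
             * device do not all fall into neighbouring buckets.
             */
            bucket  = (dev << (bh_hash_shift - 6)) ^ (dev << (bh_hash_shift - 9));
            bucket ^= (block << (bh_hash_shift - 6)) ^ (block >> 13) ^
                      (block << (bh_hash_shift - 12));
            return &hash_table[bucket & bh_hash_mask];      /* == &hash(dev, block) */
    }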
-static void insert_into_hash_list(struct buffer_head * bh)
+static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
{
- bh->b_next = NULL;
- bh->b_pprev = NULL;
- if (bh->b_dev) {
- struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
- struct buffer_head *next = *bhp;
-
- if (next) {
- bh->b_next = next;
- next->b_pprev = &bh->b_next;
- }
- *bhp = bh;
- bh->b_pprev = bhp;
- nr_hashed_buffers++;
- }
+ if ((bh->b_next = *head) != NULL)
+ bh->b_next->b_pprev = &bh->b_next;
+ *head = bh;
+ bh->b_pprev = head;
}
-static void remove_from_hash_queue(struct buffer_head * bh)
+static __inline__ void __hash_unlink(struct buffer_head *bh)
{
- struct buffer_head **pprev = bh->b_pprev;
- if (pprev) {
- struct buffer_head * next = bh->b_next;
- if (next) {
- next->b_pprev = pprev;
- bh->b_next = NULL;
- }
- *pprev = next;
- bh->b_pprev = NULL;
- nr_hashed_buffers--;
- }
+ if (bh->b_next)
+ bh->b_next->b_pprev = bh->b_pprev;
+ *(bh->b_pprev) = bh->b_next;
+ bh->b_pprev = NULL;
}
-static void insert_into_lru_list(struct buffer_head * bh)
+static void __insert_into_lru_list(struct buffer_head * bh, int blist)
{
- struct buffer_head **bhp = &lru_list[bh->b_list];
-
- if (bh->b_dev == B_FREE)
- BUG();
+ struct buffer_head **bhp = &lru_list[blist];
if(!*bhp) {
*bhp = bh;
bh->b_prev_free = bh;
}
-
- if (bh->b_next_free)
- panic("VFS: buffer LRU pointers corrupted");
-
bh->b_next_free = *bhp;
bh->b_prev_free = (*bhp)->b_prev_free;
(*bhp)->b_prev_free->b_next_free = bh;
(*bhp)->b_prev_free = bh;
-
- nr_buffers++;
- nr_buffers_type[bh->b_list]++;
+ nr_buffers_type[blist]++;
}
-static void remove_from_lru_list(struct buffer_head * bh)
+static void __remove_from_lru_list(struct buffer_head * bh, int blist)
{
- if (!(bh->b_prev_free) || !(bh->b_next_free))
- return;
-
- if (bh->b_dev == B_FREE) {
- printk("LRU list corrupted");
- *(int*)0 = 0;
+ if (bh->b_prev_free || bh->b_next_free) {
+ bh->b_prev_free->b_next_free = bh->b_next_free;
+ bh->b_next_free->b_prev_free = bh->b_prev_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = bh->b_next_free;
+ if (lru_list[blist] == bh)
+ lru_list[blist] = NULL;
+ bh->b_next_free = bh->b_prev_free = NULL;
+ nr_buffers_type[blist]--;
}
- bh->b_prev_free->b_next_free = bh->b_next_free;
- bh->b_next_free->b_prev_free = bh->b_prev_free;
-
- if (lru_list[bh->b_list] == bh)
- lru_list[bh->b_list] = bh->b_next_free;
- if (lru_list[bh->b_list] == bh)
- lru_list[bh->b_list] = NULL;
- bh->b_next_free = bh->b_prev_free = NULL;
-
- nr_buffers--;
- nr_buffers_type[bh->b_list]--;
}
-static void remove_from_free_list(struct buffer_head * bh)
+static void __remove_from_free_list(struct buffer_head * bh, int index)
{
- int isize = BUFSIZE_INDEX(bh->b_size);
- if (!(bh->b_prev_free) || !(bh->b_next_free))
- panic("VFS: Free block list corrupted");
- if(bh->b_dev != B_FREE)
- panic("Free list corrupted");
- if(!free_list[isize])
- panic("Free list empty");
if(bh->b_next_free == bh)
- free_list[isize] = NULL;
+ free_list[index].list = NULL;
else {
bh->b_prev_free->b_next_free = bh->b_next_free;
bh->b_next_free->b_prev_free = bh->b_prev_free;
- if (free_list[isize] == bh)
- free_list[isize] = bh->b_next_free;
+ if (free_list[index].list == bh)
+ free_list[index].list = bh->b_next_free;
}
bh->b_next_free = bh->b_prev_free = NULL;
}
-static void remove_from_queues(struct buffer_head * bh)
+/* The following two functions must operate atomically
+ * because they control the visibility of a buffer head
+ * to the rest of the kernel.
+ */
+static __inline__ void __remove_from_queues(struct buffer_head *bh)
{
- if (bh->b_dev == B_FREE)
- BUG();
- remove_from_hash_queue(bh);
- remove_from_lru_list(bh);
+ write_lock(&hash_table_lock);
+ if (bh->b_pprev)
+ __hash_unlink(bh);
+ __remove_from_lru_list(bh, bh->b_list);
+ write_unlock(&hash_table_lock);
}
-static void put_last_free(struct buffer_head * bh)
+static void insert_into_queues(struct buffer_head *bh)
{
- if (bh) {
- struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
-
- if (bh->b_count)
- BUG();
-
- bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
-
- /* Add to back of free list. */
- if(!*bhp) {
- *bhp = bh;
- bh->b_prev_free = bh;
- }
-
- bh->b_next_free = *bhp;
- bh->b_prev_free = (*bhp)->b_prev_free;
- (*bhp)->b_prev_free->b_next_free = bh;
- (*bhp)->b_prev_free = bh;
- }
+ struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
+
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ __hash_link(bh, head);
+ __insert_into_lru_list(bh, bh->b_list);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
}
-struct buffer_head * find_buffer(kdev_t dev, int block, int size)
-{
- struct buffer_head * next;
+/* This function must only run if there are no other
+ * references _anywhere_ to this buffer head.
+ */
+static void put_last_free(struct buffer_head * bh)
+{
+ struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
+ struct buffer_head **bhp = &head->list;
- next = hash(dev,block);
- for (;;) {
- struct buffer_head *tmp = next;
- if (!next)
- break;
- next = tmp->b_next;
- if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
- continue;
- next = tmp;
- break;
+ spin_lock(&head->lock);
+ bh->b_dev = B_FREE;
+ if(!*bhp) {
+ *bhp = bh;
+ bh->b_prev_free = bh;
}
- return next;
+ bh->b_next_free = *bhp;
+ bh->b_prev_free = (*bhp)->b_prev_free;
+ (*bhp)->b_prev_free->b_next_free = bh;
+ (*bhp)->b_prev_free = bh;
+ spin_unlock(&head->lock);
}
/*
@@ -578,10 +567,19 @@ struct buffer_head * find_buffer(kdev_t dev, int block, int size)
*/
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
- struct buffer_head * bh;
- bh = find_buffer(dev,block,size);
+ struct buffer_head **head = &hash(dev, block);
+ struct buffer_head *bh;
+
+ read_lock(&hash_table_lock);
+ for(bh = *head; bh; bh = bh->b_next)
+ if (bh->b_blocknr == block &&
+ bh->b_size == size &&
+ bh->b_dev == dev)
+ break;
if (bh)
- bh->b_count++;
+ atomic_inc(&bh->b_count);
+ read_unlock(&hash_table_lock);
+
return bh;
}
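A hypothetical caller, just to show the reference-counting contract: the lookup
raises b_count under the hash lock, so the caller must balance it with brelse()
(block_is_uptodate() below is an invented name):

    int block_is_uptodate(kdev_t dev, int block, int size)
    {
            struct buffer_head *bh = get_hash_table(dev, block, size);
            int ret = 0;

            if (bh) {
                    ret = buffer_uptodate(bh);
                    brelse(bh);             /* drop the reference taken by the lookup */
            }
            return ret;
    }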
@@ -630,6 +628,8 @@ void set_blocksize(kdev_t dev, int size)
* around on the free list, and we can get in a loop if we are not careful.
*/
for(nlist = 0; nlist < NR_LIST; nlist++) {
+ repeat:
+ spin_lock(&lru_list_lock);
bh = lru_list[nlist];
for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
if(!bh)
@@ -640,21 +640,25 @@ void set_blocksize(kdev_t dev, int size)
continue;
if (bh->b_size == size)
continue;
- bhnext->b_count++;
- bh->b_count++;
- wait_on_buffer(bh);
- bhnext->b_count--;
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ wait_on_buffer(bh);
+ atomic_dec(&bh->b_count);
+ goto repeat;
+ }
if (bh->b_dev == dev && bh->b_size != size) {
clear_bit(BH_Dirty, &bh->b_state);
clear_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Req, &bh->b_state);
bh->b_flushtime = 0;
}
- if (--bh->b_count)
- continue;
- remove_from_queues(bh);
- put_last_free(bh);
+ if (atomic_read(&bh->b_count) == 0) {
+ __remove_from_queues(bh);
+ put_last_free(bh);
+ }
}
+ spin_unlock(&lru_list_lock);
}
}
@@ -721,10 +725,11 @@ static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
*/
spin_lock_irqsave(&page_uptodate_lock, flags);
unlock_buffer(bh);
- bh->b_count--;
+ atomic_dec(&bh->b_count);
tmp = bh->b_this_page;
while (tmp != bh) {
- if (tmp->b_count && (tmp->b_end_io == end_buffer_io_async))
+ if (atomic_read(&tmp->b_count) &&
+ (tmp->b_end_io == end_buffer_io_async))
goto still_busy;
tmp = tmp->b_this_page;
}
@@ -794,24 +799,26 @@ repeat:
}
isize = BUFSIZE_INDEX(size);
-get_free:
- bh = free_list[isize];
+ spin_lock(&free_list[isize].lock);
+ bh = free_list[isize].list;
+ if (bh) {
+ __remove_from_free_list(bh, isize);
+ atomic_set(&bh->b_count, 1);
+ }
+ spin_unlock(&free_list[isize].lock);
if (!bh)
goto refill;
- remove_from_free_list(bh);
/* OK, FINALLY we know that this buffer is the only one of its kind,
- * and that it's unused (b_count=0), unlocked, and clean.
+ * we hold a reference (b_count>0), it is unlocked, and it is clean.
*/
init_buffer(bh, end_buffer_io_sync, NULL);
bh->b_dev = dev;
bh->b_blocknr = block;
- bh->b_count = 1;
bh->b_state = 1 << BH_Mapped;
/* Insert the buffer into the regular lists */
- insert_into_lru_list(bh);
- insert_into_hash_list(bh);
+ insert_into_queues(bh);
goto out;
/*
@@ -820,24 +827,12 @@ get_free:
*/
refill:
refill_freelist(size);
- if (!find_buffer(dev,block,size))
- goto get_free;
goto repeat;
out:
return bh;
}
/*
- * Put a buffer into the appropriate list, without side-effects.
- */
-static void file_buffer(struct buffer_head *bh, int list)
-{
- remove_from_lru_list(bh);
- bh->b_list = list;
- insert_into_lru_list(bh);
-}
-
-/*
* if a new dirty buffer is created we need to balance bdflush.
*
* in the future we might want to make bdflush aware of different
@@ -875,34 +870,29 @@ void __mark_buffer_dirty(struct buffer_head *bh, int flag)
__mark_dirty(bh, flag);
}
-void __atomic_mark_buffer_dirty(struct buffer_head *bh, int flag)
-{
- lock_kernel();
- __mark_dirty(bh, flag);
- unlock_kernel();
-}
-
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
*/
-void refile_buffer(struct buffer_head * buf)
+static __inline__ void __refile_buffer(struct buffer_head *bh)
{
- int dispose;
-
- if (buf->b_dev == B_FREE) {
- printk("Attempt to refile free buffer\n");
- return;
- }
-
- dispose = BUF_CLEAN;
- if (buffer_locked(buf))
+ int dispose = BUF_CLEAN;
+ if (buffer_locked(bh))
dispose = BUF_LOCKED;
- if (buffer_dirty(buf))
+ if (buffer_dirty(bh))
dispose = BUF_DIRTY;
+ if (dispose != bh->b_list) {
+ __remove_from_lru_list(bh, bh->b_list);
+ bh->b_list = dispose;
+ __insert_into_lru_list(bh, dispose);
+ }
+}
- if (dispose != buf->b_list)
- file_buffer(buf, dispose);
+void refile_buffer(struct buffer_head *bh)
+{
+ spin_lock(&lru_list_lock);
+ __refile_buffer(bh);
+ spin_unlock(&lru_list_lock);
}
/*
@@ -912,8 +902,8 @@ void __brelse(struct buffer_head * buf)
{
touch_buffer(buf);
- if (buf->b_count) {
- buf->b_count--;
+ if (atomic_read(&buf->b_count)) {
+ atomic_dec(&buf->b_count);
wake_up(&buffer_wait);
return;
}
@@ -928,14 +918,22 @@ void __brelse(struct buffer_head * buf)
*/
void __bforget(struct buffer_head * buf)
{
- if (buf->b_count != 1 || buffer_locked(buf)) {
- __brelse(buf);
- return;
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
+ touch_buffer(buf);
+ atomic_dec(&buf->b_count);
+ wake_up(&buffer_wait);
+ } else {
+ atomic_set(&buf->b_count, 0);
+ buf->b_state = 0;
+ if (buf->b_pprev)
+ __hash_unlink(buf);
+ __remove_from_lru_list(buf, buf->b_list);
+ put_last_free(buf);
}
- buf->b_count = 0;
- buf->b_state = 0;
- remove_from_queues(buf);
- put_last_free(buf);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
}
/*
@@ -1025,21 +1023,25 @@ struct buffer_head * breada(kdev_t dev, int block, int bufsize,
/*
* Note: the caller should wake up the buffer_wait list if needed.
*/
-static void put_unused_buffer_head(struct buffer_head * bh)
+static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
{
if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
- nr_buffer_heads--;
kmem_cache_free(bh_cachep, bh);
- return;
+ } else {
+ bh->b_blocknr = -1;
+ init_waitqueue_head(&bh->b_wait);
+ nr_unused_buffer_heads++;
+ bh->b_next_free = unused_list;
+ bh->b_this_page = NULL;
+ unused_list = bh;
}
+}
-// memset(bh, 0, sizeof(*bh));
- bh->b_blocknr = -1;
- init_waitqueue_head(&bh->b_wait);
- nr_unused_buffer_heads++;
- bh->b_next_free = unused_list;
- bh->b_this_page = NULL;
- unused_list = bh;
+static void put_unused_buffer_head(struct buffer_head *bh)
+{
+ spin_lock(&unused_list_lock);
+ __put_unused_buffer_head(bh);
+ spin_unlock(&unused_list_lock);
}
/*
@@ -1051,12 +1053,15 @@ static struct buffer_head * get_unused_buffer_head(int async)
{
struct buffer_head * bh;
+ spin_lock(&unused_list_lock);
if (nr_unused_buffer_heads > NR_RESERVED) {
bh = unused_list;
unused_list = bh->b_next_free;
nr_unused_buffer_heads--;
+ spin_unlock(&unused_list_lock);
return bh;
}
+ spin_unlock(&unused_list_lock);
/* This is critical. We can't swap out pages to get
* more buffer heads, because the swap-out may need
@@ -1065,20 +1070,23 @@ static struct buffer_head * get_unused_buffer_head(int async)
if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
memset(bh, 0, sizeof(*bh));
init_waitqueue_head(&bh->b_wait);
- nr_buffer_heads++;
return bh;
}
/*
* If we need an async buffer, use the reserved buffer heads.
*/
- if (async && unused_list) {
- bh = unused_list;
- unused_list = bh->b_next_free;
- nr_unused_buffer_heads--;
- return bh;
+ if (async) {
+ spin_lock(&unused_list_lock);
+ if (unused_list) {
+ bh = unused_list;
+ unused_list = bh->b_next_free;
+ nr_unused_buffer_heads--;
+ spin_unlock(&unused_list_lock);
+ return bh;
+ }
+ spin_unlock(&unused_list_lock);
}
-
#if 0
/*
* (Pending further analysis ...)
@@ -1090,7 +1098,6 @@ static struct buffer_head * get_unused_buffer_head(int async)
(bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
memset(bh, 0, sizeof(*bh));
init_waitqueue_head(&bh->b_wait);
- nr_buffer_heads++;
return bh;
}
#endif
@@ -1127,7 +1134,8 @@ try_again:
bh->b_state = 0;
bh->b_next_free = NULL;
- bh->b_count = 0;
+ bh->b_pprev = NULL;
+ atomic_set(&bh->b_count, 0);
bh->b_size = size;
bh->b_data = (char *) (page+offset);
@@ -1197,9 +1205,7 @@ static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], i
* They show up in the buffer hash table and are registered in
* page->buffers.
*/
- lock_kernel();
head = create_buffers(page_address(page), size, 1);
- unlock_kernel();
if (page->buffers)
BUG();
if (!head)
@@ -1248,7 +1254,6 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
BUG();
if (!page->buffers)
return 0;
- lock_kernel();
head = page->buffers;
bh = head;
@@ -1261,7 +1266,7 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
*/
if (offset <= curr_off) {
if (buffer_mapped(bh)) {
- bh->b_count++;
+ atomic_inc(&bh->b_count);
wait_on_buffer(bh);
if (bh->b_dev == B_FREE)
BUG();
@@ -1269,7 +1274,7 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
clear_bit(BH_Uptodate, &bh->b_state);
clear_bit(BH_Mapped, &bh->b_state);
bh->b_blocknr = 0;
- bh->b_count--;
+ atomic_dec(&bh->b_count);
}
}
curr_off = next_off;
@@ -1288,10 +1293,9 @@ int block_flushpage(struct inode *inode, struct page *page, unsigned long offset
*/
if (!offset) {
if (!try_to_free_buffers(page))
- buffermem += PAGE_CACHE_SIZE;
+ atomic_add(PAGE_CACHE_SIZE, &buffermem);
}
- unlock_kernel();
return 0;
}
@@ -1299,9 +1303,7 @@ static void create_empty_buffers(struct page *page, struct inode *inode, unsigne
{
struct buffer_head *bh, *head, *tail;
- lock_kernel();
head = create_buffers(page_address(page), blocksize, 1);
- unlock_kernel();
if (page->buffers)
BUG();
@@ -1365,7 +1367,7 @@ int block_write_full_page(struct file *file, struct page *page)
goto out;
}
set_bit(BH_Uptodate, &bh->b_state);
- atomic_mark_buffer_dirty(bh,0);
+ mark_buffer_dirty(bh,0);
bh = bh->b_this_page;
block++;
@@ -1458,10 +1460,8 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
if (buffer_new(bh)) {
memset(bh->b_data, 0, bh->b_size);
} else {
- lock_kernel();
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
- unlock_kernel();
err = -EIO;
if (!buffer_uptodate(bh))
goto out;
@@ -1498,11 +1498,9 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
*/
set_bit(BH_Uptodate, &bh->b_state);
if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
- lock_kernel();
__mark_dirty(bh, 0);
if (too_many_dirty_buffers)
balance_dirty(bh->b_dev);
- unlock_kernel();
}
if (err) {
@@ -1569,7 +1567,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
do {
block = *(b++);
- if (fresh && (bh->b_count != 0))
+ if (fresh && (atomic_read(&bh->b_count) != 0))
BUG();
if (rw == READ) {
if (!fresh)
@@ -1582,7 +1580,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
BUG();
if (!buffer_uptodate(bh)) {
arr[nr++] = bh;
- bh->b_count++;
+ atomic_inc(&bh->b_count);
}
}
} else { /* WRITE */
@@ -1597,7 +1595,7 @@ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
set_bit(BH_Uptodate, &bh->b_state);
set_bit(BH_Dirty, &bh->b_state);
arr[nr++] = bh;
- bh->b_count++;
+ atomic_inc(&bh->b_count);
}
bh = bh->b_this_page;
} while (bh != head);
@@ -1663,7 +1661,7 @@ int block_read_full_page(struct file * file, struct page * page)
}
init_buffer(bh, end_buffer_io_async, NULL);
- bh->b_count++;
+ atomic_inc(&bh->b_count);
arr[nr] = bh;
nr++;
} while (iblock++, (bh = bh->b_this_page) != head);
@@ -1710,8 +1708,9 @@ static int grow_buffers(int size)
}
isize = BUFSIZE_INDEX(size);
- insert_point = free_list[isize];
+ spin_lock(&free_list[isize].lock);
+ insert_point = free_list[isize].list;
tmp = bh;
while (1) {
if (insert_point) {
@@ -1730,9 +1729,11 @@ static int grow_buffers(int size)
break;
}
tmp->b_this_page = bh;
- free_list[isize] = bh;
+ free_list[isize].list = bh;
+ spin_unlock(&free_list[isize].lock);
+
mem_map[MAP_NR(page)].buffers = bh;
- buffermem += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &buffermem);
return 1;
}
@@ -1740,7 +1741,7 @@ static int grow_buffers(int size)
* Can the buffer be thrown out?
*/
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh) ((bh)->b_count | ((bh)->b_state & BUFFER_BUSY_BITS))
+#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
* try_to_free_buffers() checks if all the buffers on this particular page
@@ -1748,11 +1749,20 @@ static int grow_buffers(int size)
*
* Wake up bdflush() if this fails - if we're running low on memory due
* to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ * obtain a reference to a buffer head within a page. So we must
+ * lock out all of these paths to cleanly toss the page.
*/
int try_to_free_buffers(struct page * page)
{
struct buffer_head * tmp, * bh = page->buffers;
+ int index = BUFSIZE_INDEX(bh->b_size);
+ int ret;
+ spin_lock(&lru_list_lock);
+ write_lock(&hash_table_lock);
+ spin_lock(&free_list[index].lock);
tmp = bh;
do {
struct buffer_head * p = tmp;
@@ -1762,19 +1772,25 @@ int try_to_free_buffers(struct page * page)
goto busy_buffer_page;
} while (tmp != bh);
+ spin_lock(&unused_list_lock);
tmp = bh;
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
- /* The buffer can be either on the regular queues or on the free list.. */
- if (p->b_dev == B_FREE)
- remove_from_free_list(p);
- else
- remove_from_queues(p);
-
- put_unused_buffer_head(p);
+ /* The buffer can be either on the regular
+ * queues or on the free list..
+ */
+ if (p->b_dev == B_FREE) {
+ __remove_from_free_list(p, index);
+ } else {
+ if (p->b_pprev)
+ __hash_unlink(p);
+ __remove_from_lru_list(p, p->b_list);
+ }
+ __put_unused_buffer_head(p);
} while (tmp != bh);
+ spin_unlock(&unused_list_lock);
/* Wake up anyone waiting for buffer heads */
wake_up(&buffer_wait);
@@ -1782,55 +1798,21 @@ int try_to_free_buffers(struct page * page)
/* And free the page */
page->buffers = NULL;
__free_page(page);
- return 1;
+ ret = 1;
+out:
+ spin_unlock(&free_list[index].lock);
+ write_unlock(&hash_table_lock);
+ spin_unlock(&lru_list_lock);
+ return ret;
busy_buffer_page:
/* Uhhuh, start writeback so that we don't end up with all dirty pages */
too_many_dirty_buffers = 1;
wakeup_bdflush(0);
- return 0;
-}
-
-/* ================== Debugging =================== */
-
-void show_buffers(void)
-{
- struct buffer_head * bh;
- int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
- int protected = 0;
- int nlist;
- static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
-
- printk("Buffer memory: %6dkB\n",buffermem>>10);
- printk("Buffer heads: %6d\n",nr_buffer_heads);
- printk("Buffer blocks: %6d\n",nr_buffers);
- printk("Buffer hashed: %6d\n",nr_hashed_buffers);
-
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- found = locked = dirty = used = lastused = protected = 0;
- bh = lru_list[nlist];
- if(!bh) continue;
-
- do {
- found++;
- if (buffer_locked(bh))
- locked++;
- if (buffer_protected(bh))
- protected++;
- if (buffer_dirty(bh))
- dirty++;
- if (bh->b_count)
- used++, lastused = found;
- bh = bh->b_next_free;
- } while (bh != lru_list[nlist]);
- printk("%8s: %d buffers, %d used (last=%d), "
- "%d locked, %d protected, %d dirty\n",
- buf_types[nlist], found, used, lastused,
- locked, protected, dirty);
- };
+ ret = 0;
+ goto out;
}
-
/* ===================== Init ======================= */
/*
@@ -1840,31 +1822,46 @@ void show_buffers(void)
*/
void __init buffer_init(unsigned long memory_size)
{
- int order;
+ int order, i;
unsigned int nr_hash;
- /* we need to guess at the right sort of size for a buffer cache.
- the heuristic from working with large databases and getting
- fsync times (ext2) manageable, is the following */
-
- memory_size >>= 22;
- for (order = 5; (1UL << order) < memory_size; order++);
+ /* The buffer cache hash table is less important these days,
+ * trim it a bit.
+ */
+ memory_size >>= 14;
+ memory_size *= sizeof(struct buffer_head *);
+ for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
+ ;
/* try to allocate something until we get it or we're asking
for something that is really too small */
do {
- nr_hash = (1UL << order) * PAGE_SIZE /
- sizeof(struct buffer_head *);
+ nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
+ bh_hash_mask = (nr_hash - 1);
+ bh_hash_shift = (PAGE_SHIFT + order);
hash_table = (struct buffer_head **)
__get_free_pages(GFP_ATOMIC, order);
- } while (hash_table == NULL && --order > 4);
- printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);
-
+ } while (hash_table == NULL && --order > 0);
+ printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
+ nr_hash, order, (1UL<<order) * PAGE_SIZE);
+
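As a worked example of the sizing above (assuming a 32-bit box with 4 KB pages,
i.e. sizeof(struct buffer_head *) == 4 and PAGE_SHIFT == 12), 64 MB of memory
gives:

    memory_size   = 64 MB >> 14              = 4096
                  * sizeof(buffer_head *)    = 16384 bytes
    order         : smallest order with (PAGE_SIZE << order) >= 16384  ->  2
    nr_hash       = 16384 / 4                = 4096 buckets (one per 16 KB of RAM)
    bh_hash_mask  = 4095
    bh_hash_shift = PAGE_SHIFT + order       = 14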
if (!hash_table)
panic("Failed to allocate buffer hash table\n");
- memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
- bh_hash_mask = nr_hash-1;
+
+ /* Setup hash chains. */
+ for(i = 0; i < nr_hash; i++)
+ hash_table[i] = NULL;
+
+ /* Setup free lists. */
+ for(i = 0; i < NR_SIZES; i++) {
+ free_list[i].list = NULL;
+ free_list[i].lock = SPIN_LOCK_UNLOCKED;
+ }
+
+ /* Setup lru lists. */
+ for(i = 0; i < NR_LIST; i++)
+ lru_list[i] = NULL;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head),
@@ -1872,21 +1869,6 @@ void __init buffer_init(unsigned long memory_size)
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!bh_cachep)
panic("Cannot create buffer head SLAB cache\n");
- /*
- * Allocate the reserved buffer heads.
- */
- while (nr_buffer_heads < NR_RESERVED) {
- struct buffer_head * bh;
-
- bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
- if (!bh)
- break;
- put_unused_buffer_head(bh);
- nr_buffer_heads++;
- }
-
- lru_list[BUF_CLEAN] = 0;
- grow_buffers(BLOCK_SIZE);
}
@@ -1922,70 +1904,49 @@ void wakeup_bdflush(int wait)
static int sync_old_buffers(void)
{
- int i;
- int ndirty, nwritten;
int nlist;
- int ncount;
- struct buffer_head * bh, *next;
+ lock_kernel();
sync_supers(0);
sync_inodes(0);
+ unlock_kernel();
- ncount = 0;
-#ifdef DEBUG
- for(nlist = 0; nlist < NR_LIST; nlist++)
-#else
- for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
-#endif
- {
- ndirty = 0;
- nwritten = 0;
+ for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+ struct buffer_head *bh;
repeat:
-
+ spin_lock(&lru_list_lock);
bh = lru_list[nlist];
- if(bh)
- for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
- /* We may have stalled while waiting for I/O to complete. */
- if(bh->b_list != nlist) goto repeat;
- next = bh->b_next_free;
- if(!lru_list[nlist]) {
- printk("Dirty list empty %d\n", i);
- break;
- }
-
- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
- refile_buffer(bh);
- continue;
- }
-
- /* Unlocked buffer on locked list? Refile it */
- if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
- refile_buffer(bh);
- continue;
- }
+ if(bh) {
+ struct buffer_head *next;
+ int i;
+ for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
+ next = bh->b_next_free;
+
+ /* If the buffer is not on the proper list,
+ * then refile it.
+ */
+ if ((nlist == BUF_DIRTY &&
+ (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+ (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+ __refile_buffer(bh);
+ continue;
+ }
- if (buffer_locked(bh) || !buffer_dirty(bh))
- continue;
- ndirty++;
- nwritten++;
- next->b_count++;
- bh->b_count++;
- bh->b_flushtime = 0;
-#ifdef DEBUG
- if(nlist != BUF_DIRTY) ncount++;
-#endif
- ll_rw_block(WRITE, 1, &bh);
- bh->b_count--;
- next->b_count--;
- }
+ if (buffer_locked(bh) || !buffer_dirty(bh))
+ continue;
+
+ /* OK, now we are committed to write it out. */
+ bh->b_flushtime = 0;
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
+ ll_rw_block(WRITE, 1, &bh);
+ atomic_dec(&bh->b_count);
+ goto repeat;
+ }
+ }
+ spin_unlock(&lru_list_lock);
}
run_task_queue(&tq_disk);
-#ifdef DEBUG
- if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
- printk("Wrote %d/%d buffers\n", nwritten, ndirty);
-#endif
- run_task_queue(&tq_disk);
return 0;
}
@@ -1999,7 +1960,6 @@ asmlinkage int sys_bdflush(int func, long data)
{
int i, error = -EPERM;
- lock_kernel();
if (!capable(CAP_SYS_ADMIN))
goto out;
@@ -2031,7 +1991,6 @@ asmlinkage int sys_bdflush(int func, long data)
*/
error = 0;
out:
- unlock_kernel();
return error;
}
@@ -2053,52 +2012,37 @@ int bdflush(void * unused)
sprintf(current->comm, "kflushd");
bdflush_tsk = current;
- /*
- * As a kernel thread we want to tamper with system buffers
- * and other internals and thus be subject to the SMP locking
- * rules. (On a uniprocessor box this does nothing).
- */
- lock_kernel();
-
for (;;) {
int nlist;
CHECK_EMERGENCY_SYNC
- for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
- {
- int nr;
- int written = 0;
+ for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
+ int nr, major, written = 0;
struct buffer_head *next;
- int major;
repeat:
+ spin_lock(&lru_list_lock);
next = lru_list[nlist];
nr = nr_buffers_type[nlist];
-
while (nr-- > 0) {
struct buffer_head *bh = next;
- /* We may have stalled while waiting for I/O to complete. */
- if (next->b_list != nlist)
- goto repeat;
+
next = next->b_next_free;
- /* Clean buffer on dirty list? Refile it */
- if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
- refile_buffer(bh);
- continue;
- }
-
- /* Unlocked buffer on locked list? Refile it */
- if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
- refile_buffer(bh);
+ /* If the buffer is not on the correct list,
+ * then refile it.
+ */
+ if ((nlist == BUF_DIRTY &&
+ (!buffer_dirty(bh) && !buffer_locked(bh))) ||
+ (nlist == BUF_LOCKED && !buffer_locked(bh))) {
+ __refile_buffer(bh);
continue;
}
- /*
- * If we aren't in panic mode, don't write out too much
- * at a time. Also, don't write out buffers we don't really
- * have to write out yet..
+ /* If we aren't in panic mode, don't write out too much
+ * at a time. Also, don't write out buffers we don't
+ * really have to write out yet..
*/
if (!too_many_dirty_buffers) {
if (written > bdf_prm.b_un.ndirty)
@@ -2111,9 +2055,6 @@ int bdflush(void * unused)
continue;
major = MAJOR(bh->b_dev);
- if (next)
- next->b_count++;
- bh->b_count++;
written++;
bh->b_flushtime = 0;
@@ -2121,18 +2062,19 @@ int bdflush(void * unused)
* For the loop major we can try to do asynchronous writes,
* but we have to guarantee that we're making some progress..
*/
+ atomic_inc(&bh->b_count);
+ spin_unlock(&lru_list_lock);
if (major == LOOP_MAJOR && written > 1) {
ll_rw_block(WRITEA, 1, &bh);
if (buffer_dirty(bh))
--written;
} else
ll_rw_block(WRITE, 1, &bh);
-
- bh->b_count--;
- if (next)
- next->b_count--;
wake_up(&buffer_wait);
+ atomic_dec(&bh->b_count);
+ goto repeat;
}
+ spin_unlock(&lru_list_lock);
}
run_task_queue(&tq_disk);
wake_up(&bdflush_done);