@@ -63,7 +63,6 @@ unset CONFIG_ALPHA_T2 CONFIG_ALPHA_PYXIS CONFIG_ALPHA_POLARIS
unset CONFIG_ALPHA_TSUNAMI CONFIG_ALPHA_MCPCIA
unset CONFIG_ALPHA_IRONGATE
unset CONFIG_ALPHA_BROKEN_IRQ_MASK
-unset CONFIG_ALPHA_LARGE_VMALLOC
# Most of these machines have ISA slots; not exactly sure which don't,
# and this doesn't activate hordes of code, so do it always.
@@ -215,6 +214,8 @@ if [ "$CONFIG_ALPHA_GENERIC" = "y" -o "$CONFIG_ALPHA_DP264" = "y" \
-o "$CONFIG_ALPHA_WILDFIRE" = "y" -o "$CONFIG_ALPHA_TITAN" = "y" ]
then
bool 'Large VMALLOC support' CONFIG_ALPHA_LARGE_VMALLOC
+else
+ define_bool CONFIG_ALPHA_LARGE_VMALLOC n
fi
source drivers/pci/Config.in
@@ -160,15 +160,20 @@ EXPORT_SYMBOL_NOVERS(__do_clear_user);
EXPORT_SYMBOL(__strncpy_from_user);
EXPORT_SYMBOL(__strnlen_user);
-/*
- * The following are specially called from the semaphore assembly stubs.
- */
-EXPORT_SYMBOL_NOVERS(__down_failed);
-EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
-EXPORT_SYMBOL_NOVERS(__up_wakeup);
-EXPORT_SYMBOL_NOVERS(__down_read_failed);
-EXPORT_SYMBOL_NOVERS(__down_write_failed);
-EXPORT_SYMBOL_NOVERS(__rwsem_wake);
+/* Semaphore helper functions. */
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__up_wakeup);
+EXPORT_SYMBOL(down);
+EXPORT_SYMBOL(down_interruptible);
+EXPORT_SYMBOL(up);
+EXPORT_SYMBOL(__down_read_failed);
+EXPORT_SYMBOL(__down_write_failed);
+EXPORT_SYMBOL(__rwsem_wake);
+EXPORT_SYMBOL(down_read);
+EXPORT_SYMBOL(down_write);
+EXPORT_SYMBOL(up_read);
+EXPORT_SYMBOL(up_write);
/*
* SMP-specific symbols.
-/*
- * Generic semaphore code. Buyer beware. Do your own
- * specific changes in <asm/semaphore-helper.h>
- */
-
-#include <linux/sched.h>
-#include <asm/semaphore-helper.h>
-
-/*
- * Semaphores are implemented using a two-way counter:
- * The "count" variable is decremented for each process
- * that tries to sleep, while the "waking" variable is
- * incremented when the "up()" code goes to wake up waiting
- * processes.
- *
- * Notably, the inline "up()" and "down()" functions can
- * efficiently test if they need to do any extra work (up
- * needs to do something only if count was negative before
- * the increment operation.
- *
- * waking_non_zero() (from asm/semaphore.h) must execute
- * atomically.
- *
- * When __up() is called, the count was negative before
- * incrementing it, and we need to wake up somebody.
- *
- * This routine adds one to the count of processes that need to
- * wake up and exit. ALL waiting processes actually wake up but
- * only the one that gets to the "waking" field first will gate
- * through and acquire the semaphore. The others will go back
- * to sleep.
- *
- * Note that these functions are only called when there is
- * contention on the lock, and as such all this is the
- * "non-critical" part of the whole semaphore business. The
- * critical part is the inline stuff in <asm/semaphore.h>
- * where we want to avoid any extra jumps and calls.
- */
-
-void
-__up(struct semaphore *sem)
-{
- wake_one_more(sem);
- wake_up(&sem->wait);
-}
-
-/*
- * Perform the "down" function. Return zero for semaphore acquired,
- * return negative for signalled out of the function.
- *
- * If called from __down, the return is ignored and the wait loop is
- * not interruptible. This means that a task waiting on a semaphore
- * using "down()" cannot be killed until someone does an "up()" on
- * the semaphore.
- *
- * If called from __down_interruptible, the return value gets checked
- * upon return. If the return value is negative then the task continues
- * with the negative value in the return register (it can be tested by
- * the caller).
- *
- * Either form may be used in conjunction with "up()".
- *
- */
-
-#define DOWN_VAR \
- struct task_struct *tsk = current; \
- wait_queue_t wait; \
- init_waitqueue_entry(&wait, tsk)
-
-#define DOWN_HEAD(task_state) \
- \
- \
- tsk->state = (task_state); \
- add_wait_queue(&sem->wait, &wait); \
- \
- /* \
- * Ok, we're set up. sem->count is known to be less than zero \
- * so we must wait. \
- * \
- * We can let go the lock for purposes of waiting. \
- * We re-acquire it after awaking so as to protect \
- * all semaphore operations. \
- * \
- * If "up()" is called before we call waking_non_zero() then \
- * we will catch it right away. If it is called later then \
- * we will have to go through a wakeup cycle to catch it. \
- * \
- * Multiple waiters contend for the semaphore lock to see \
- * who gets to gate through and who has to wait some more. \
- */ \
- for (;;) {
-
-#define DOWN_TAIL(task_state) \
- tsk->state = (task_state); \
- } \
- tsk->state = TASK_RUNNING; \
- remove_wait_queue(&sem->wait, &wait)
-
-void
-__down(struct semaphore * sem)
-{
- DOWN_VAR;
- DOWN_HEAD(TASK_UNINTERRUPTIBLE);
-
- if (waking_non_zero(sem))
- break;
- schedule();
-
- DOWN_TAIL(TASK_UNINTERRUPTIBLE);
-}
-
-int
-__down_interruptible(struct semaphore * sem)
-{
- int ret = 0;
- DOWN_VAR;
- DOWN_HEAD(TASK_INTERRUPTIBLE);
-
- ret = waking_non_zero_interruptible(sem, tsk);
- if (ret)
- {
- if (ret == 1)
- /* ret != 0 only if we get interrupted -arca */
- ret = 0;
- break;
- }
- schedule();
-
- DOWN_TAIL(TASK_INTERRUPTIBLE);
- return ret;
-}
-
-int
-__down_trylock(struct semaphore * sem)
-{
- return waking_non_zero_trylock(sem);
-}
-
-
-/*
- * RW Semaphores
- */
-
-void
-__down_read(struct rw_semaphore *sem, int count)
-{
- long tmp;
- DOWN_VAR;
-
- retry_down:
- if (count < 0) {
- /* Wait for the lock to become unbiased. Readers
- are non-exclusive. */
-
- /* This takes care of granting the lock. */
- up_read(sem);
-
- add_wait_queue(&sem->wait, &wait);
- while (sem->count < 0) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if (sem->count >= 0)
- break;
- schedule();
- }
-
- remove_wait_queue(&sem->wait, &wait);
- tsk->state = TASK_RUNNING;
-
- __asm __volatile (
- " mb\n"
- "1: ldl_l %0,%1\n"
- " subl %0,1,%2\n"
- " subl %0,1,%0\n"
- " stl_c %2,%1\n"
- " bne %2,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=r"(count), "=m"(sem->count), "=r"(tmp)
- : : "memory");
- if (count <= 0)
- goto retry_down;
- } else {
- add_wait_queue(&sem->wait, &wait);
-
- while (1) {
- if (test_and_clear_bit(0, &sem->granted))
- break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if ((sem->granted & 1) == 0)
- schedule();
- }
-
- remove_wait_queue(&sem->wait, &wait);
- tsk->state = TASK_RUNNING;
- }
-}
-
-void
-__down_write(struct rw_semaphore *sem, int count)
-{
- long tmp;
- DOWN_VAR;
-
- retry_down:
- if (count + RW_LOCK_BIAS < 0) {
- up_write(sem);
-
- add_wait_queue_exclusive(&sem->wait, &wait);
-
- while (sem->count < 0) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if (sem->count >= RW_LOCK_BIAS)
- break;
- schedule();
- }
-
- remove_wait_queue(&sem->wait, &wait);
- tsk->state = TASK_RUNNING;
-
- __asm __volatile (
- " mb\n"
- "1: ldl_l %0,%1\n"
- " ldah %2,%3(%0)\n"
- " ldah %0,%3(%0)\n"
- " stl_c %2,%1\n"
- " bne %2,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- : "=r"(count), "=m"(sem->count), "=r"(tmp)
- : "i"(-(RW_LOCK_BIAS >> 16))
- : "memory");
- if (count != 0)
- goto retry_down;
- } else {
- /* Put ourselves at the end of the list. */
- add_wait_queue_exclusive(&sem->write_bias_wait, &wait);
-
- while (1) {
- if (test_and_clear_bit(1, &sem->granted))
- break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if ((sem->granted & 2) == 0)
- schedule();
- }
-
- remove_wait_queue(&sem->write_bias_wait, &wait);
- tsk->state = TASK_RUNNING;
-
- /* If the lock is currently unbiased, awaken the sleepers.
- FIXME: This wakes up the readers early in a bit of a
- stampede -> bad! */
- if (sem->count >= 0)
- wake_up(&sem->wait);
- }
-}
-
-void
-__do_rwsem_wake(struct rw_semaphore *sem, int readers)
-{
- if (readers) {
- if (test_and_set_bit(0, &sem->granted))
- BUG();
- wake_up(&sem->wait);
- } else {
- if (test_and_set_bit(1, &sem->granted))
- BUG();
- wake_up(&sem->write_bias_wait);
- }
-}
+/*
+ * Alpha semaphore implementation.
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ * (C) Copyright 1999, 2000 Richard Henderson
+ */
+
+#include <linux/sched.h>
+
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ *
+ * The "count" variable is decremented for each process that tries to sleep,
+ * while the "waking" variable is incremented when the "up()" code goes to
+ * wake up waiting processes.
+ *
+ * Notably, the inline "up()" and "down()" functions can efficiently test
+ * if they need to do any extra work (up needs to do something only if count
+ * was negative before the increment operation).
+ *
+ * waking_non_zero() (from asm/semaphore.h) must execute atomically.
+ *
+ * When __up() is called, the count was negative before incrementing it,
+ * and we need to wake up somebody.
+ *
+ * This routine adds one to the count of processes that need to wake up and
+ * exit. ALL waiting processes actually wake up but only the one that gets
+ * to the "waking" field first will gate through and acquire the semaphore.
+ * The others will go back to sleep.
+ *
+ * Note that these functions are only called when there is contention on the
+ * lock, and as such all this is the "non-critical" part of the whole
+ * semaphore business. The critical part is the inline stuff in
+ * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
+ */
+
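For illustration, the scheme described above looks roughly like this in portable C with GCC atomic builtins. This is a sketch, not part of the patch: every name below is made up, and the real code keeps count and waking adjacent so the slow paths can update both with a single 64-bit ll/sc (see the comment in __down_failed_interruptible further down), while the real inline fast paths live in <asm/semaphore.h>.

struct sketch_sem {
	int count;	/* >0: free, 0: held, <0: held and tasks sleeping */
	int waking;	/* wakeups handed out by up() but not yet consumed */
};

extern void sketch_down_failed(struct sketch_sem *sem);	/* slow path */
extern void sketch_up_wakeup(struct sketch_sem *sem);	/* slow path */

static inline void sketch_down(struct sketch_sem *sem)
{
	/* Extra work is needed only if the count went negative. */
	if (__atomic_sub_fetch(&sem->count, 1, __ATOMIC_ACQUIRE) < 0)
		sketch_down_failed(sem);
}

static inline void sketch_up(struct sketch_sem *sem)
{
	/* Wake somebody only if the count was negative before the inc. */
	if (__atomic_add_fetch(&sem->count, 1, __ATOMIC_RELEASE) <= 0) {
		__atomic_add_fetch(&sem->waking, 1, __ATOMIC_RELEASE);
		sketch_up_wakeup(sem);
	}
}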
+/*
+ * Perform the "down" function. Return zero for semaphore acquired,
+ * return negative for signalled out of the function.
+ *
+ * If called from down, the return is ignored and the wait loop is
+ * not interruptible. This means that a task waiting on a semaphore
+ * using "down()" cannot be killed until someone does an "up()" on
+ * the semaphore.
+ *
+ * If called from down_interruptible, the return value gets checked
+ * upon return. If the return value is negative then the task continues
+ * with the negative value in the return register (it can be tested by
+ * the caller).
+ *
+ * Either form may be used in conjunction with "up()".
+ */
+
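To illustrate the convention above, a typical caller of the interruptible form just tests the return value. This is a hedged sketch with hypothetical names (example_sem and do_protected_work are made up), assuming the usual <asm/semaphore.h> and <linux/errno.h> includes:

static DECLARE_MUTEX(example_sem);		/* hypothetical */
extern void do_protected_work(void);		/* hypothetical */

static int example_op(void)
{
	if (down_interruptible(&example_sem))
		return -ERESTARTSYS;	/* a signal arrived before we got it */
	do_protected_work();
	up(&example_sem);
	return 0;
}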
+void
+__down_failed(struct semaphore *sem)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down failed(%p)\n",
+ current->comm, current->pid, sem);
+#endif
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ wmb();
+ add_wait_queue_exclusive(&sem->wait, &wait);
+
+ /* At this point we know that sem->count is negative. In order
+ to avoid racing with __up, we must check for wakeup before
+ going to sleep the first time. */
+
+ while (1) {
+ long ret, tmp;
+
+ /* An atomic conditional decrement of sem->waking. */
+ __asm__ __volatile__(
+ "1: ldl_l %1,%2\n"
+ " blt %1,2f\n"
+ " subl %1,1,%0\n"
+ " stl_c %0,%2\n"
+ " beq %0,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=r"(ret), "=&r"(tmp), "=m"(sem->waking)
+ : "0"(0));
+
+ if (ret)
+ break;
+
+ schedule();
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->wait, &wait);
+ current->state = TASK_RUNNING;
+
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down acquired(%p)\n",
+ current->comm, current->pid, sem);
+#endif
+}
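The ldl_l/stl_c loop above is simply "consume a wakeup if one is pending". Written as a compare-and-swap loop with GCC builtins it would look roughly like the sketch below (illustrative only; the kernel keeps the raw ll/sc form so the retry branch can live in a separate section):

static inline int consume_waking(int *waking)
{
	int old = __atomic_load_n(waking, __ATOMIC_RELAXED);

	while (old >= 0) {
		/* A wakeup is available: try to take it (waking -= 1). */
		if (__atomic_compare_exchange_n(waking, &old, old - 1,
						0, __ATOMIC_ACQUIRE,
						__ATOMIC_RELAXED))
			return 1;	/* we now own the semaphore */
		/* The failed CAS refreshed "old"; test it again. */
	}
	return 0;			/* nothing pending: go (back) to sleep */
}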
+
+int
+__down_failed_interruptible(struct semaphore *sem)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ long ret;
+
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down failed(%p)\n",
+ current->comm, current->pid, sem);
+#endif
+
+ current->state = TASK_INTERRUPTIBLE;
+ wmb();
+ add_wait_queue_exclusive(&sem->wait, &wait);
+
+ while (1) {
+ long tmp, tmp2, tmp3;
+
+ /* We must undo the sem->count down_interruptible decrement
+ simultaneously and atomically with the sem->waking
+ adjustment, otherwise we can race with __up. This is
+ accomplished by doing a 64-bit ll/sc on two 32-bit words.
+
+ "Equivalent" C. Note that we have to do this all without
+ (taken) branches in order to be a valid ll/sc sequence.
+
+ do {
+ tmp = ldq_l;
+ ret = 0;
+ if (tmp >= 0) { // waking >= 0
+ tmp += 0xffffffff00000000; // waking -= 1
+ ret = 1;
+ }
+ else if (pending) {
+ // count += 1, but since -1 + 1 carries into the
+ // high word, we have to be more careful here.
+ tmp = (tmp & 0xffffffff00000000)
+ | ((tmp + 1) & 0x00000000ffffffff);
+ ret = -EINTR;
+ }
+ tmp = stq_c = tmp;
+ } while (tmp == 0);
+ */
+
+ __asm__ __volatile__(
+ "1: ldq_l %1,%4\n"
+ " lda %0,0\n"
+ " cmovne %5,%6,%0\n"
+ " addq %1,1,%2\n"
+ " and %1,%7,%3\n"
+ " andnot %2,%7,%2\n"
+ " cmovge %1,1,%0\n"
+ " or %3,%2,%2\n"
+ " addq %1,%7,%3\n"
+ " cmovne %5,%2,%1\n"
+ " cmovge %2,%3,%1\n"
+ " stq_c %1,%4\n"
+ " beq %1,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2),
+ "=&r"(tmp3), "=m"(*sem)
+ : "r"(signal_pending(current)), "r"(-EINTR),
+ "r"(0xffffffff00000000));
+
+ /* At this point we have ret
+ 1 got the lock
+ 0 go to sleep
+ -EINTR interrupted */
+ if (ret != 0)
+ break;
+
+ schedule();
+ set_task_state(current, TASK_INTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->wait, &wait);
+ current->state = TASK_RUNNING;
+ wake_up(&sem->wait);
+
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down %s(%p)\n",
+ current->comm, current->pid,
+ (ret < 0 ? "interrupted" : "acquired"), sem);
+#endif
+
+ /* Convert "got the lock" to 0==success. */
+ return (ret < 0 ? ret : 0);
+}
+
+void
+__up_wakeup(struct semaphore *sem)
+{
+ wake_up(&sem->wait);
+}
+
+void
+down(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down(%p) <count=%d> from %p\n",
+ current->comm, current->pid, sem,
+ atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+ __down(sem);
+}
+
+int
+down_interruptible(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down(%p) <count=%d> from %p\n",
+ current->comm, current->pid, sem,
+ atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+ return __down_interruptible(sem);
+}
+
+int
+down_trylock(struct semaphore *sem)
+{
+ int ret;
+
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+
+ ret = __down_trylock(sem);
+
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): down_trylock %s from %p\n",
+ current->comm, current->pid,
+ ret ? "failed" : "acquired",
+ __builtin_return_address(0));
+#endif
+
+ return ret;
+}
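As the printk above spells out, down_trylock() uses the inverted return convention: 0 means the semaphore was acquired, non-zero means it was busy. A minimal caller sketch, reusing the hypothetical names from the earlier example:

static int example_poll(void)
{
	if (down_trylock(&example_sem))
		return -EBUSY;		/* held by someone else; don't sleep */
	do_protected_work();
	up(&example_sem);
	return 0;
}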
+
+void
+up(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+ printk("%s(%d): up(%p) <count=%d> from %p\n",
+ current->comm, current->pid, sem,
+ atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+ __up(sem);
+}
+
+
+/*
+ * RW Semaphores
+ */
+
+void
+__down_read_failed(struct rw_semaphore *sem, int count)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ retry_down:
+ if (count < 0) {
+ /* Waiting on multiple readers and/or writers. */
+
+ /* Undo the acquisition we started in down_read. */
+ atomic_inc(&sem->count);
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ wmb();
+ add_wait_queue(&sem->wait, &wait);
+ mb();
+ while (atomic_read(&sem->count) < 0) {
+ schedule();
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->wait, &wait);
+ current->state = TASK_RUNNING;
+
+ mb();
+ count = atomic_dec_return(&sem->count);
+ if (count <= 0)
+ goto retry_down;
+ } else {
+ /* Waiting on exactly one writer. */
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ wmb();
+ add_wait_queue(&sem->wait, &wait);
+ mb();
+
+ while (!test_and_clear_bit(0, &sem->granted)) {
+ schedule();
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->wait, &wait);
+ current->state = TASK_RUNNING;
+ }
+}
+
+void
+__down_write_failed(struct rw_semaphore *sem, int count)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ retry_down:
+ if (count + RW_LOCK_BIAS < 0) {
+ /* Waiting on multiple readers and/or writers. */
+
+ /* Undo the acquisition we started in down_write. */
+ atomic_add(RW_LOCK_BIAS, &sem->count);
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ wmb();
+ add_wait_queue_exclusive(&sem->wait, &wait);
+ mb();
+
+ while (atomic_read(&sem->count) + RW_LOCK_BIAS < 0) {
+ schedule();
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->wait, &wait);
+ current->state = TASK_RUNNING;
+
+ count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
+ if (count != 0)
+ goto retry_down;
+ } else {
+ /* Waiting on exactly one writer. */
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ wmb();
+ add_wait_queue_exclusive(&sem->wait, &wait);
+ mb();
+
+ while (!test_and_clear_bit(1, &sem->granted)) {
+ schedule();
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&sem->write_bias_wait, &wait);
+ current->state = TASK_RUNNING;
+
+ /* If the lock is currently unbiased, awaken the sleepers.
+ FIXME: This wakes up the readers early in a bit of a
+ stampede -> bad! */
+ count = atomic_read(&sem->count);
+ if (__builtin_expect(count >= 0, 0))
+ wake_up(&sem->wait);
+ }
+}
+
+void
+__rwsem_wake(struct rw_semaphore *sem, int readers)
+{
+ if (readers) {
+ if (test_and_set_bit(0, &sem->granted))
+ BUG();
+ wake_up(&sem->wait);
+ } else {
+ if (test_and_set_bit(1, &sem->granted))
+ BUG();
+ wake_up(&sem->write_bias_wait);
+ }
+}
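The "granted" bit protocol shared by __down_read_failed, __down_write_failed and __rwsem_wake above is, in isolation, a one-shot hand-off: the waker publishes a grant bit and wakes the queue, and exactly one sleeper clears the bit and proceeds while the rest go back to sleep. A stripped-down sketch with GCC builtins (illustrative only; wait_for_wakeup() and wake_all_sleepers() are stand-ins for the add_wait_queue/schedule loop and the wake_up() calls in the real code):

extern void wait_for_wakeup(void);	/* stand-in for the schedule() loop */
extern void wake_all_sleepers(void);	/* stand-in for wake_up(&sem->wait) */

static void handoff_grant(unsigned long *granted, unsigned long bit)
{
	/* Waker: publish the grant (granting twice while one is still
	   pending is the BUG() case above), then wake everyone. */
	__atomic_fetch_or(granted, bit, __ATOMIC_RELEASE);
	wake_all_sleepers();
}

static void handoff_accept(unsigned long *granted, unsigned long bit)
{
	/* Sleepers: only the task that actually clears the bit wins;
	   everyone else sleeps again, as with test_and_clear_bit above. */
	while (!(__atomic_fetch_and(granted, ~bit, __ATOMIC_ACQUIRE) & bit))
		wait_for_wakeup();
}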
+
+void
+down_read(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+ __down_read(sem);
+#if WAITQUEUE_DEBUG
+ if (sem->granted & 2)
+ BUG();
+ if (atomic_read(&sem->writers))
+ BUG();
+ atomic_inc(&sem->readers);
+#endif
+}
+
+void
+down_write(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+#endif
+ __down_write(sem);
+#if WAITQUEUE_DEBUG
+ if (sem->granted & 3)
+ BUG();
+ if (atomic_read(&sem->writers))
+ BUG();
+ if (atomic_read(&sem->readers))
+ BUG();
+ atomic_inc(&sem->writers);
+#endif
+}
+
+void
+up_read(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+ if (sem->granted & 2)
+ BUG();
+ if (atomic_read(&sem->writers))
+ BUG();
+ atomic_dec(&sem->readers);
+#endif
+ __up_read(sem);
+}
+
+void
+up_write(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(sem->__magic);
+ if (sem->granted & 3)
+ BUG();
+ if (atomic_read(&sem->readers))
+ BUG();
+ if (atomic_read(&sem->writers) != 1)
+ BUG();
+ atomic_dec(&sem->writers);
+#endif
+ __up_write(sem);
+}
@@ -378,6 +378,9 @@ do_settimeofday(struct timeval *tv)
* BUG: This routine does not handle hour overflow properly; it just
* sets the minutes. Usually you won't notice until after reboot!
*/
+
+extern int abs(int);
+
static int
set_rtc_mmss(unsigned long nowtime)
{
@@ -12,7 +12,7 @@ OBJS = __divqu.o __remqu.o __divlu.o __remlu.o memset.o memcpy.o io.o \
strcat.o strcpy.o strncat.o strncpy.o stxcpy.o stxncpy.o \
strchr.o strrchr.o memchr.o \
copy_user.o clear_user.o strncpy_from_user.o strlen_user.o \
- csum_ipv6_magic.o strcasecmp.o semaphore.o fpreg.o \
+ csum_ipv6_magic.o strcasecmp.o fpreg.o \
callback_srm.o srm_puts.o srm_printk.o
lib.a: $(OBJS)
+++ /dev/null
-/*
- * linux/arch/alpha/lib/semaphore.S
- *
- * Copyright (C) 1999, 2000 Richard Henderson
- */
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- */
-
- .set noat
- .set noreorder
- .align 4
-
-/* __down_failed takes the semaphore in $24, clobbers $24 and $28. */
-
- .globl __down_failed
- .ent __down_failed
-__down_failed:
- ldgp $29,0($27)
- lda $30, -20*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $25, 17*8($30)
- stq $26, 18*8($30)
- .frame $30, 20*8, $28
- .prologue 1
-
- mov $24, $16
- jsr __down
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $25, 17*8($30)
- ldq $26, 18*8($30)
- lda $30, 20*8($30)
- ret $31, ($28), 0
- .end __down_failed
-
-/* __down_failed_interruptible takes the semaphore in $24,
- clobbers $28, returns success in $24. */
-
- .globl __down_failed_interruptible
- .ent __down_failed_interruptible
-__down_failed_interruptible:
- ldgp $29,0($27)
- lda $30, -20*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $25, 17*8($30)
- stq $26, 18*8($30)
- .frame $30, 20*8, $28
- .prologue 1
-
- mov $24, $16
- jsr __down_interruptible
- mov $0, $24
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $25, 17*8($30)
- ldq $26, 18*8($30)
- lda $30, 20*8($30)
- ret $31, ($28), 0
- .end __down_failed_interruptible
-
-/* __up_wakeup takes the semaphore in $24, clobbers $24 and $28. */
-
- .globl __up_wakeup
- .ent __up_wakeup
-__up_wakeup:
- ldgp $29,0($27)
- lda $30, -20*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $25, 17*8($30)
- stq $26, 18*8($30)
- .frame $30, 20*8, $28
- .prologue 1
-
- mov $24, $16
- jsr __up
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $25, 17*8($30)
- ldq $26, 18*8($30)
- lda $30, 20*8($30)
- ret $31, ($28), 0
- .end __up_wakeup
-
-/* __down_read_failed takes the semaphore in $24, count in $25;
- clobbers $24, $25 and $28. */
-
- .globl __down_read_failed
- .ent __down_read_failed
-__down_read_failed:
- ldgp $29,0($27)
- lda $30, -18*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $26, 17*8($30)
- .frame $30, 18*8, $28
- .prologue 1
-
- mov $24, $16
- mov $25, $17
- jsr __down_read
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $26, 17*8($30)
- lda $30, 18*8($30)
- ret $31, ($28), 0
- .end __down_read_failed
-
-/* __down_write_failed takes the semaphore in $24, count in $25;
- clobbers $24, $25 and $28. */
-
- .globl __down_write_failed
- .ent __down_write_failed
-__down_write_failed:
- ldgp $29,0($27)
- lda $30, -20*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $26, 17*8($30)
- .frame $30, 18*8, $28
- .prologue 1
-
- mov $24, $16
- mov $25, $17
- jsr __down_write
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $26, 17*8($30)
- lda $30, 18*8($30)
- ret $31, ($28), 0
- .end __down_write_failed
-
-/* __rwsem_wake takes the semaphore in $24, readers in $25;
- clobbers $24, $25, and $28. */
-
- .globl __rwsem_wake
- .ent __rwsem_wake
-__rwsem_wake:
- ldgp $29,0($27)
- lda $30, -18*8($30)
- stq $28, 0*8($30)
- stq $0, 1*8($30)
- stq $1, 2*8($30)
- stq $2, 3*8($30)
- stq $3, 4*8($30)
- stq $4, 5*8($30)
- stq $5, 6*8($30)
- stq $6, 7*8($30)
- stq $7, 8*8($30)
- stq $16, 9*8($30)
- stq $17, 10*8($30)
- stq $18, 11*8($30)
- stq $19, 12*8($30)
- stq $20, 13*8($30)
- stq $21, 14*8($30)
- stq $22, 15*8($30)
- stq $23, 16*8($30)
- stq $26, 17*8($30)
- .frame $30, 18*8, $28
- .prologue 1
-
- mov $24, $16
- mov $25, $17
- jsr __do_rwsem_wake
-
- ldq $28, 0*8($30)
- ldq $0, 1*8($30)
- ldq $1, 2*8($30)
- ldq $2, 3*8($30)
- ldq $3, 4*8($30)
- ldq $4, 5*8($30)
- ldq $5, 6*8($30)
- ldq $6, 7*8($30)
- ldq $7, 8*8($30)
- ldq $16, 9*8($30)
- ldq $17, 10*8($30)
- ldq $18, 11*8($30)
- ldq $19, 12*8($30)
- ldq $20, 13*8($30)
- ldq $21, 14*8($30)
- ldq $22, 15*8($30)
- ldq $23, 16*8($30)
- ldq $26, 17*8($30)
- lda $30, 18*8($30)
- ret $31, ($28), 0
- .end __rwsem_wake
@@ -2344,18 +2344,7 @@ static mdk_personality_t raid5_personality=
int raid5_init (void)
{
- int err;
-
- err = register_md_personality (RAID5, &raid5_personality);
- if (err)
- return err;
-
- /*
- * pick a XOR routine, runtime.
- */
- calibrate_xor_block();
-
- return 0;
+ return register_md_personality (RAID5, &raid5_personality);
}
#ifdef MODULE
-/*
- * xor.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
- *
- *
- * optimized RAID-5 checksumming functions.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/config.h>
-#define BH_TRACE 0
-#include <linux/module.h>
-#include <linux/raid/md.h>
-#ifdef __sparc_v9__
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/visasm.h>
-#endif
-
-/*
- * we use the 'XOR function template' to register multiple xor
- * functions runtime. The kernel measures their speed upon bootup
- * and decides which one to use. (compile-time registration is
- * not enough as certain CPU features like MMX can only be detected
- * runtime)
- *
- * this architecture makes it pretty easy to add new routines
- * that are faster on certain CPUs, without killing other CPU's
- * 'native' routine. Although the current routines are belived
- * to be the physically fastest ones on all CPUs tested, but
- * feel free to prove me wrong and add yet another routine =B-)
- * --mingo
- */
-
-#define MAX_XOR_BLOCKS 5
-
-#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
-
-typedef void (*xor_block_t) XOR_ARGS;
-xor_block_t xor_block = NULL;
-
-#ifndef __sparc_v9__
-
-struct xor_block_template;
-
-struct xor_block_template {
- char * name;
- xor_block_t xor_block;
- int speed;
- struct xor_block_template * next;
-};
-
-struct xor_block_template * xor_functions = NULL;
-
-#define XORBLOCK_TEMPLATE(x) \
-static void xor_block_##x XOR_ARGS; \
-static struct xor_block_template t_xor_block_##x = \
- { #x, xor_block_##x, 0, NULL }; \
-static void xor_block_##x XOR_ARGS
-
-#ifdef __i386__
-
-#ifdef CONFIG_X86_XMM
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-XORBLOCK_TEMPLATE(pIII_kni)
-{
- char xmm_save[16*4];
- int cr0;
- int lines = (bh_ptr[0]->b_size>>8);
-
- __asm__ __volatile__ (
- "movl %%cr0,%0 ;\n\t"
- "clts ;\n\t"
- "movups %%xmm0,(%1) ;\n\t"
- "movups %%xmm1,0x10(%1) ;\n\t"
- "movups %%xmm2,0x20(%1) ;\n\t"
- "movups %%xmm3,0x30(%1) ;\n\t"
- : "=r" (cr0)
- : "r" (xmm_save)
- : "memory" );
-
-#define OFFS(x) "8*("#x"*2)"
-#define PF0(x) \
- " prefetcht0 "OFFS(x)"(%1) ;\n"
-#define LD(x,y) \
- " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x,y) \
- " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
-#define PF1(x) \
- " prefetchnta "OFFS(x)"(%2) ;\n"
-#define PF2(x) \
- " prefetchnta "OFFS(x)"(%3) ;\n"
-#define PF3(x) \
- " prefetchnta "OFFS(x)"(%4) ;\n"
-#define PF4(x) \
- " prefetchnta "OFFS(x)"(%5) ;\n"
-#define PF5(x) \
- " prefetchnta "OFFS(x)"(%6) ;\n"
-#define XO1(x,y) \
- " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x,y) \
- " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x,y) \
- " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x,y) \
- " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x,y) \
- " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
-
- switch(count) {
- case 2:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
- PF1(i) \
- PF1(i+2) \
- LD(i+2,2) \
- LD(i+3,3) \
- PF0(i+4) \
- PF0(i+6) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data)
- : "memory" );
- break;
- case 3:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- PF2(i) \
- PF2(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data)
- : "memory" );
- break;
- case 4:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- PF2(i) \
- PF2(i+2) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- PF3(i) \
- PF3(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- XO3(i,0) \
- XO3(i+1,1) \
- XO3(i+2,2) \
- XO3(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data)
- : "memory" );
- break;
- case 5:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- PF2(i) \
- PF2(i+2) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- PF3(i) \
- PF3(i+2) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- PF4(i) \
- PF4(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO3(i,0) \
- XO3(i+1,1) \
- XO3(i+2,2) \
- XO3(i+3,3) \
- XO4(i,0) \
- XO4(i+1,1) \
- XO4(i+2,2) \
- XO4(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " addl $256, %5 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data),
- "r" (bh_ptr[4]->b_data)
- : "memory");
- break;
- }
-
- __asm__ __volatile__ (
- "sfence ;\n\t"
- "movups (%1),%%xmm0 ;\n\t"
- "movups 0x10(%1),%%xmm1 ;\n\t"
- "movups 0x20(%1),%%xmm2 ;\n\t"
- "movups 0x30(%1),%%xmm3 ;\n\t"
- "movl %0,%%cr0 ;\n\t"
- :
- : "r" (cr0), "r" (xmm_save)
- : "memory" );
-}
-
-#undef OFFS
-#undef LD
-#undef ST
-#undef PF0
-#undef PF1
-#undef PF2
-#undef PF3
-#undef PF4
-#undef PF5
-#undef XO1
-#undef XO2
-#undef XO3
-#undef XO4
-#undef XO5
-#undef BLOCK
-
-#endif /* CONFIG_X86_XMM */
-
-/*
- * high-speed RAID5 checksumming functions utilizing MMX instructions
- * Copyright (C) 1998 Ingo Molnar
- */
-XORBLOCK_TEMPLATE(pII_mmx)
-{
- char fpu_save[108];
- int lines = (bh_ptr[0]->b_size>>7);
-
- if (!(current->flags & PF_USEDFPU))
- __asm__ __volatile__ ( " clts;\n");
-
- __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
-
-#define LD(x,y) \
- " movq 8*("#x")(%1), %%mm"#y" ;\n"
-#define ST(x,y) \
- " movq %%mm"#y", 8*("#x")(%1) ;\n"
-#define XO1(x,y) \
- " pxor 8*("#x")(%2), %%mm"#y" ;\n"
-#define XO2(x,y) \
- " pxor 8*("#x")(%3), %%mm"#y" ;\n"
-#define XO3(x,y) \
- " pxor 8*("#x")(%4), %%mm"#y" ;\n"
-#define XO4(x,y) \
- " pxor 8*("#x")(%5), %%mm"#y" ;\n"
-
- switch(count) {
- case 2:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- XO1(i,0) \
- ST(i,0) \
- XO1(i+1,1) \
- ST(i+1,1) \
- XO1(i+2,2) \
- ST(i+2,2) \
- XO1(i+3,3) \
- ST(i+3,3)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $128, %1 ;\n"
- " addl $128, %2 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data)
- : "memory");
- break;
- case 3:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- XO2(i,0) \
- ST(i,0) \
- XO2(i+1,1) \
- ST(i+1,1) \
- XO2(i+2,2) \
- ST(i+2,2) \
- XO2(i+3,3) \
- ST(i+3,3)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $128, %1 ;\n"
- " addl $128, %2 ;\n"
- " addl $128, %3 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data)
- : "memory");
- break;
- case 4:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- XO3(i,0) \
- ST(i,0) \
- XO3(i+1,1) \
- ST(i+1,1) \
- XO3(i+2,2) \
- ST(i+2,2) \
- XO3(i+3,3) \
- ST(i+3,3)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $128, %1 ;\n"
- " addl $128, %2 ;\n"
- " addl $128, %3 ;\n"
- " addl $128, %4 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data)
- : "memory");
- break;
- case 5:
- __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- XO3(i,0) \
- XO3(i+1,1) \
- XO3(i+2,2) \
- XO3(i+3,3) \
- XO4(i,0) \
- ST(i,0) \
- XO4(i+1,1) \
- ST(i+1,1) \
- XO4(i+2,2) \
- ST(i+2,2) \
- XO4(i+3,3) \
- ST(i+3,3)
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $128, %1 ;\n"
- " addl $128, %2 ;\n"
- " addl $128, %3 ;\n"
- " addl $128, %4 ;\n"
- " addl $128, %5 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- :
- : "g" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data),
- "r" (bh_ptr[4]->b_data)
- : "memory");
- break;
- }
-
- __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
- if (!(current->flags & PF_USEDFPU))
- stts();
-}
-
-#undef LD
-#undef XO1
-#undef XO2
-#undef XO3
-#undef XO4
-#undef ST
-#undef BLOCK
-
-XORBLOCK_TEMPLATE(p5_mmx)
-{
- char fpu_save[108];
- int lines = (bh_ptr[0]->b_size>>6);
-
- if (!(current->flags & PF_USEDFPU))
- __asm__ __volatile__ ( " clts;\n");
-
- __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
-
- switch(count) {
- case 2:
- __asm__ __volatile__ (
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
- " movq (%1), %%mm0 ;\n"
- " movq 8(%1), %%mm1 ;\n"
- " pxor (%2), %%mm0 ;\n"
- " movq 16(%1), %%mm2 ;\n"
- " movq %%mm0, (%1) ;\n"
- " pxor 8(%2), %%mm1 ;\n"
- " movq 24(%1), %%mm3 ;\n"
- " movq %%mm1, 8(%1) ;\n"
- " pxor 16(%2), %%mm2 ;\n"
- " movq 32(%1), %%mm4 ;\n"
- " movq %%mm2, 16(%1) ;\n"
- " pxor 24(%2), %%mm3 ;\n"
- " movq 40(%1), %%mm5 ;\n"
- " movq %%mm3, 24(%1) ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
-
- " addl $64, %1 ;\n"
- " addl $64, %2 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data)
- : "memory" );
- break;
- case 3:
- __asm__ __volatile__ (
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
- " movq (%1), %%mm0 ;\n"
- " movq 8(%1), %%mm1 ;\n"
- " pxor (%2), %%mm0 ;\n"
- " movq 16(%1), %%mm2 ;\n"
- " pxor 8(%2), %%mm1 ;\n"
- " pxor (%3), %%mm0 ;\n"
- " pxor 16(%2), %%mm2 ;\n"
- " movq %%mm0, (%1) ;\n"
- " pxor 8(%3), %%mm1 ;\n"
- " pxor 16(%3), %%mm2 ;\n"
- " movq 24(%1), %%mm3 ;\n"
- " movq %%mm1, 8(%1) ;\n"
- " movq 32(%1), %%mm4 ;\n"
- " movq 40(%1), %%mm5 ;\n"
- " pxor 24(%2), %%mm3 ;\n"
- " movq %%mm2, 16(%1) ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " pxor 24(%3), %%mm3 ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " movq %%mm3, 24(%1) ;\n"
- " pxor 32(%3), %%mm4 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
-
- " addl $64, %1 ;\n"
- " addl $64, %2 ;\n"
- " addl $64, %3 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data)
- : "memory" );
- break;
- case 4:
- __asm__ __volatile__ (
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
- " movq (%1), %%mm0 ;\n"
- " movq 8(%1), %%mm1 ;\n"
- " pxor (%2), %%mm0 ;\n"
- " movq 16(%1), %%mm2 ;\n"
- " pxor 8(%2), %%mm1 ;\n"
- " pxor (%3), %%mm0 ;\n"
- " pxor 16(%2), %%mm2 ;\n"
- " pxor 8(%3), %%mm1 ;\n"
- " pxor (%4), %%mm0 ;\n"
- " movq 24(%1), %%mm3 ;\n"
- " pxor 16(%3), %%mm2 ;\n"
- " pxor 8(%4), %%mm1 ;\n"
- " movq %%mm0, (%1) ;\n"
- " movq 32(%1), %%mm4 ;\n"
- " pxor 24(%2), %%mm3 ;\n"
- " pxor 16(%4), %%mm2 ;\n"
- " movq %%mm1, 8(%1) ;\n"
- " movq 40(%1), %%mm5 ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " pxor 24(%3), %%mm3 ;\n"
- " movq %%mm2, 16(%1) ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " pxor 32(%3), %%mm4 ;\n"
- " pxor 24(%4), %%mm3 ;\n"
- " movq %%mm3, 24(%1) ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " pxor 32(%4), %%mm4 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " pxor 40(%4), %%mm5 ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%4), %%mm6 ;\n"
- " pxor 56(%4), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
-
- " addl $64, %1 ;\n"
- " addl $64, %2 ;\n"
- " addl $64, %3 ;\n"
- " addl $64, %4 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "r" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data)
- : "memory" );
- break;
- case 5:
- __asm__ __volatile__ (
-
- " .align 32,0x90 ;\n"
- " 1: ;\n"
- " movq (%1), %%mm0 ;\n"
- " movq 8(%1), %%mm1 ;\n"
- " pxor (%2), %%mm0 ;\n"
- " pxor 8(%2), %%mm1 ;\n"
- " movq 16(%1), %%mm2 ;\n"
- " pxor (%3), %%mm0 ;\n"
- " pxor 8(%3), %%mm1 ;\n"
- " pxor 16(%2), %%mm2 ;\n"
- " pxor (%4), %%mm0 ;\n"
- " pxor 8(%4), %%mm1 ;\n"
- " pxor 16(%3), %%mm2 ;\n"
- " movq 24(%1), %%mm3 ;\n"
- " pxor (%5), %%mm0 ;\n"
- " pxor 8(%5), %%mm1 ;\n"
- " movq %%mm0, (%1) ;\n"
- " pxor 16(%4), %%mm2 ;\n"
- " pxor 24(%2), %%mm3 ;\n"
- " movq %%mm1, 8(%1) ;\n"
- " pxor 16(%5), %%mm2 ;\n"
- " pxor 24(%3), %%mm3 ;\n"
- " movq 32(%1), %%mm4 ;\n"
- " movq %%mm2, 16(%1) ;\n"
- " pxor 24(%4), %%mm3 ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " movq 40(%1), %%mm5 ;\n"
- " pxor 24(%5), %%mm3 ;\n"
- " pxor 32(%3), %%mm4 ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " movq %%mm3, 24(%1) ;\n"
- " pxor 32(%4), %%mm4 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " pxor 32(%5), %%mm4 ;\n"
- " pxor 40(%4), %%mm5 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " pxor 40(%5), %%mm5 ;\n"
- " pxor 48(%4), %%mm6 ;\n"
- " pxor 56(%4), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%5), %%mm6 ;\n"
- " pxor 56(%5), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
-
- " addl $64, %1 ;\n"
- " addl $64, %2 ;\n"
- " addl $64, %3 ;\n"
- " addl $64, %4 ;\n"
- " addl $64, %5 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
-
- :
- : "g" (lines),
- "r" (bh_ptr[0]->b_data),
- "r" (bh_ptr[1]->b_data),
- "r" (bh_ptr[2]->b_data),
- "r" (bh_ptr[3]->b_data),
- "r" (bh_ptr[4]->b_data)
- : "memory" );
- break;
- }
-
- __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
- if (!(current->flags & PF_USEDFPU))
- stts();
-}
-#endif /* __i386__ */
-#endif /* !__sparc_v9__ */
-
-#ifdef __sparc_v9__
-/*
- * High speed xor_block operation for RAID4/5 utilizing the
- * UltraSparc Visual Instruction Set.
- *
- * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- *
- * Requirements:
- * !(((long)dest | (long)sourceN) & (64 - 1)) &&
- * !(len & 127) && len >= 256
- *
- * It is done in pure assembly, as otherwise gcc makes it
- * a non-leaf function, which is not what we want.
- * Also, we don't measure the speeds as on other architectures,
- * as the measuring routine does not take into account cold caches
- * and the fact that xor_block_VIS bypasses the caches.
- * xor_block_32regs might be 5% faster for count 2 if caches are hot
- * and things just right (for count 3 VIS is about as fast as 32regs for
- * hot caches and for count 4 and 5 VIS is faster by good margin always),
- * but I think it is better not to pollute the caches.
- * Actually, if I'd just fight for speed for hot caches, I could
- * write a hybrid VIS/integer routine, which would do always two
- * 64B blocks in VIS and two in IEUs, but I really care more about
- * caches.
- */
-extern void *VISenter(void);
-extern void xor_block_VIS XOR_ARGS;
-
-void __xor_block_VIS(void)
-{
-__asm__ ("
- .globl xor_block_VIS
-xor_block_VIS:
- ldx [%%o1 + 0], %%o4
- ldx [%%o1 + 8], %%o3
- ldx [%%o4 + %1], %%g5
- ldx [%%o4 + %0], %%o4
- ldx [%%o3 + %0], %%o3
- rd %%fprs, %%o5
- andcc %%o5, %2, %%g0
- be,pt %%icc, 297f
- sethi %%hi(%5), %%g1
- jmpl %%g1 + %%lo(%5), %%g7
- add %%g7, 8, %%g7
-297: wr %%g0, %4, %%fprs
- membar #LoadStore|#StoreLoad|#StoreStore
- sub %%g5, 64, %%g5
- ldda [%%o4] %3, %%f0
- ldda [%%o3] %3, %%f16
- cmp %%o0, 4
- bgeu,pt %%xcc, 10f
- cmp %%o0, 3
- be,pn %%xcc, 13f
- mov -64, %%g1
- sub %%g5, 64, %%g5
- rd %%asi, %%g1
- wr %%g0, %3, %%asi
-
-2: ldda [%%o4 + 64] %%asi, %%f32
- fxor %%f0, %%f16, %%f16
- fxor %%f2, %%f18, %%f18
- fxor %%f4, %%f20, %%f20
- fxor %%f6, %%f22, %%f22
- fxor %%f8, %%f24, %%f24
- fxor %%f10, %%f26, %%f26
- fxor %%f12, %%f28, %%f28
- fxor %%f14, %%f30, %%f30
- stda %%f16, [%%o4] %3
- ldda [%%o3 + 64] %%asi, %%f48
- ldda [%%o4 + 128] %%asi, %%f0
- fxor %%f32, %%f48, %%f48
- fxor %%f34, %%f50, %%f50
- add %%o4, 128, %%o4
- fxor %%f36, %%f52, %%f52
- add %%o3, 128, %%o3
- fxor %%f38, %%f54, %%f54
- subcc %%g5, 128, %%g5
- fxor %%f40, %%f56, %%f56
- fxor %%f42, %%f58, %%f58
- fxor %%f44, %%f60, %%f60
- fxor %%f46, %%f62, %%f62
- stda %%f48, [%%o4 - 64] %%asi
- bne,pt %%xcc, 2b
- ldda [%%o3] %3, %%f16
-
- ldda [%%o4 + 64] %%asi, %%f32
- fxor %%f0, %%f16, %%f16
- fxor %%f2, %%f18, %%f18
- fxor %%f4, %%f20, %%f20
- fxor %%f6, %%f22, %%f22
- fxor %%f8, %%f24, %%f24
- fxor %%f10, %%f26, %%f26
- fxor %%f12, %%f28, %%f28
- fxor %%f14, %%f30, %%f30
- stda %%f16, [%%o4] %3
- ldda [%%o3 + 64] %%asi, %%f48
- membar #Sync
- fxor %%f32, %%f48, %%f48
- fxor %%f34, %%f50, %%f50
- fxor %%f36, %%f52, %%f52
- fxor %%f38, %%f54, %%f54
- fxor %%f40, %%f56, %%f56
- fxor %%f42, %%f58, %%f58
- fxor %%f44, %%f60, %%f60
- fxor %%f46, %%f62, %%f62
- stda %%f48, [%%o4 + 64] %%asi
- membar #Sync|#StoreStore|#StoreLoad
- wr %%g0, 0, %%fprs
- retl
- wr %%g1, %%g0, %%asi
-
-13: ldx [%%o1 + 16], %%o2
- ldx [%%o2 + %0], %%o2
-
-3: ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f48
- fxor %%f2, %%f18, %%f50
- add %%o4, 64, %%o4
- fxor %%f4, %%f20, %%f52
- fxor %%f6, %%f22, %%f54
- add %%o3, 64, %%o3
- fxor %%f8, %%f24, %%f56
- fxor %%f10, %%f26, %%f58
- fxor %%f12, %%f28, %%f60
- fxor %%f14, %%f30, %%f62
- ldda [%%o4] %3, %%f0
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- add %%o2, 64, %%o2
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- subcc %%g5, 64, %%g5
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- stda %%f48, [%%o4 + %%g1] %3
- bne,pt %%xcc, 3b
- ldda [%%o3] %3, %%f16
-
- ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f48
- fxor %%f2, %%f18, %%f50
- fxor %%f4, %%f20, %%f52
- fxor %%f6, %%f22, %%f54
- fxor %%f8, %%f24, %%f56
- fxor %%f10, %%f26, %%f58
- fxor %%f12, %%f28, %%f60
- fxor %%f14, %%f30, %%f62
- membar #Sync
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- stda %%f48, [%%o4] %3
- membar #Sync|#StoreStore|#StoreLoad
- retl
- wr %%g0, 0, %%fprs
-
-10: cmp %%o0, 5
- be,pt %%xcc, 15f
- mov -64, %%g1
-
-14: ldx [%%o1 + 16], %%o2
- ldx [%%o1 + 24], %%o0
- ldx [%%o2 + %0], %%o2
- ldx [%%o0 + %0], %%o0
-
-4: ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f16
- fxor %%f2, %%f18, %%f18
- add %%o4, 64, %%o4
- fxor %%f4, %%f20, %%f20
- fxor %%f6, %%f22, %%f22
- add %%o3, 64, %%o3
- fxor %%f8, %%f24, %%f24
- fxor %%f10, %%f26, %%f26
- fxor %%f12, %%f28, %%f28
- fxor %%f14, %%f30, %%f30
- ldda [%%o0] %3, %%f48
- fxor %%f16, %%f32, %%f32
- fxor %%f18, %%f34, %%f34
- fxor %%f20, %%f36, %%f36
- fxor %%f22, %%f38, %%f38
- add %%o2, 64, %%o2
- fxor %%f24, %%f40, %%f40
- fxor %%f26, %%f42, %%f42
- fxor %%f28, %%f44, %%f44
- fxor %%f30, %%f46, %%f46
- ldda [%%o4] %3, %%f0
- fxor %%f32, %%f48, %%f48
- fxor %%f34, %%f50, %%f50
- fxor %%f36, %%f52, %%f52
- add %%o0, 64, %%o0
- fxor %%f38, %%f54, %%f54
- fxor %%f40, %%f56, %%f56
- fxor %%f42, %%f58, %%f58
- subcc %%g5, 64, %%g5
- fxor %%f44, %%f60, %%f60
- fxor %%f46, %%f62, %%f62
- stda %%f48, [%%o4 + %%g1] %3
- bne,pt %%xcc, 4b
- ldda [%%o3] %3, %%f16
-
- ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f16
- fxor %%f2, %%f18, %%f18
- fxor %%f4, %%f20, %%f20
- fxor %%f6, %%f22, %%f22
- fxor %%f8, %%f24, %%f24
- fxor %%f10, %%f26, %%f26
- fxor %%f12, %%f28, %%f28
- fxor %%f14, %%f30, %%f30
- ldda [%%o0] %3, %%f48
- fxor %%f16, %%f32, %%f32
- fxor %%f18, %%f34, %%f34
- fxor %%f20, %%f36, %%f36
- fxor %%f22, %%f38, %%f38
- fxor %%f24, %%f40, %%f40
- fxor %%f26, %%f42, %%f42
- fxor %%f28, %%f44, %%f44
- fxor %%f30, %%f46, %%f46
- membar #Sync
- fxor %%f32, %%f48, %%f48
- fxor %%f34, %%f50, %%f50
- fxor %%f36, %%f52, %%f52
- fxor %%f38, %%f54, %%f54
- fxor %%f40, %%f56, %%f56
- fxor %%f42, %%f58, %%f58
- fxor %%f44, %%f60, %%f60
- fxor %%f46, %%f62, %%f62
- stda %%f48, [%%o4] %3
- membar #Sync|#StoreStore|#StoreLoad
- retl
- wr %%g0, 0, %%fprs
-
-15: ldx [%%o1 + 16], %%o2
- ldx [%%o1 + 24], %%o0
- ldx [%%o1 + 32], %%o1
- ldx [%%o2 + %0], %%o2
- ldx [%%o0 + %0], %%o0
- ldx [%%o1 + %0], %%o1
-
-5: ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f48
- fxor %%f2, %%f18, %%f50
- add %%o4, 64, %%o4
- fxor %%f4, %%f20, %%f52
- fxor %%f6, %%f22, %%f54
- add %%o3, 64, %%o3
- fxor %%f8, %%f24, %%f56
- fxor %%f10, %%f26, %%f58
- fxor %%f12, %%f28, %%f60
- fxor %%f14, %%f30, %%f62
- ldda [%%o0] %3, %%f16
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- add %%o2, 64, %%o2
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- ldda [%%o1] %3, %%f32
- fxor %%f48, %%f16, %%f48
- fxor %%f50, %%f18, %%f50
- add %%o0, 64, %%o0
- fxor %%f52, %%f20, %%f52
- fxor %%f54, %%f22, %%f54
- add %%o1, 64, %%o1
- fxor %%f56, %%f24, %%f56
- fxor %%f58, %%f26, %%f58
- fxor %%f60, %%f28, %%f60
- fxor %%f62, %%f30, %%f62
- ldda [%%o4] %3, %%f0
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- subcc %%g5, 64, %%g5
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- stda %%f48, [%%o4 + %%g1] %3
- bne,pt %%xcc, 5b
- ldda [%%o3] %3, %%f16
-
- ldda [%%o2] %3, %%f32
- fxor %%f0, %%f16, %%f48
- fxor %%f2, %%f18, %%f50
- fxor %%f4, %%f20, %%f52
- fxor %%f6, %%f22, %%f54
- fxor %%f8, %%f24, %%f56
- fxor %%f10, %%f26, %%f58
- fxor %%f12, %%f28, %%f60
- fxor %%f14, %%f30, %%f62
- ldda [%%o0] %3, %%f16
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- ldda [%%o1] %3, %%f32
- fxor %%f48, %%f16, %%f48
- fxor %%f50, %%f18, %%f50
- fxor %%f52, %%f20, %%f52
- fxor %%f54, %%f22, %%f54
- fxor %%f56, %%f24, %%f56
- fxor %%f58, %%f26, %%f58
- fxor %%f60, %%f28, %%f60
- fxor %%f62, %%f30, %%f62
- membar #Sync
- fxor %%f48, %%f32, %%f48
- fxor %%f50, %%f34, %%f50
- fxor %%f52, %%f36, %%f52
- fxor %%f54, %%f38, %%f54
- fxor %%f56, %%f40, %%f56
- fxor %%f58, %%f42, %%f58
- fxor %%f60, %%f44, %%f60
- fxor %%f62, %%f46, %%f62
- stda %%f48, [%%o4] %3
- membar #Sync|#StoreStore|#StoreLoad
- retl
- wr %%g0, 0, %%fprs
- " : :
- "i" (&((struct buffer_head *)0)->b_data),
- "i" (&((struct buffer_head *)0)->b_size),
- "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
- "i" (FPRS_FEF), "i" (VISenter));
-}
-#endif /* __sparc_v9__ */
-
-#if defined(__sparc__) && !defined(__sparc_v9__)
-/*
- * High speed xor_block operation for RAID4/5 utilizing the
- * ldd/std SPARC instructions.
- *
- * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
- *
- */
-
-XORBLOCK_TEMPLATE(SPARC)
-{
- int size = bh_ptr[0]->b_size;
- int lines = size / (sizeof (long)) / 8, i;
- long *destp = (long *) bh_ptr[0]->b_data;
- long *source1 = (long *) bh_ptr[1]->b_data;
- long *source2, *source3, *source4;
-
- switch (count) {
- case 2:
- for (i = lines; i > 0; i--) {
- __asm__ __volatile__("
- ldd [%0 + 0x00], %%g2
- ldd [%0 + 0x08], %%g4
- ldd [%0 + 0x10], %%o0
- ldd [%0 + 0x18], %%o2
- ldd [%1 + 0x00], %%o4
- ldd [%1 + 0x08], %%l0
- ldd [%1 + 0x10], %%l2
- ldd [%1 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- std %%g2, [%0 + 0x00]
- std %%g4, [%0 + 0x08]
- std %%o0, [%0 + 0x10]
- std %%o2, [%0 + 0x18]
- " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0",
- "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
- destp += 8;
- source1 += 8;
- }
- break;
- case 3:
- source2 = (long *) bh_ptr[2]->b_data;
- for (i = lines; i > 0; i--) {
- __asm__ __volatile__("
- ldd [%0 + 0x00], %%g2
- ldd [%0 + 0x08], %%g4
- ldd [%0 + 0x10], %%o0
- ldd [%0 + 0x18], %%o2
- ldd [%1 + 0x00], %%o4
- ldd [%1 + 0x08], %%l0
- ldd [%1 + 0x10], %%l2
- ldd [%1 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%2 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%2 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%2 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%2 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- std %%g2, [%0 + 0x00]
- std %%g4, [%0 + 0x08]
- std %%o0, [%0 + 0x10]
- std %%o2, [%0 + 0x18]
- " : : "r" (destp), "r" (source1), "r" (source2)
- : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
- "l0", "l1", "l2", "l3", "l4", "l5");
- destp += 8;
- source1 += 8;
- source2 += 8;
- }
- break;
- case 4:
- source2 = (long *) bh_ptr[2]->b_data;
- source3 = (long *) bh_ptr[3]->b_data;
- for (i = lines; i > 0; i--) {
- __asm__ __volatile__("
- ldd [%0 + 0x00], %%g2
- ldd [%0 + 0x08], %%g4
- ldd [%0 + 0x10], %%o0
- ldd [%0 + 0x18], %%o2
- ldd [%1 + 0x00], %%o4
- ldd [%1 + 0x08], %%l0
- ldd [%1 + 0x10], %%l2
- ldd [%1 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%2 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%2 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%2 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%2 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%3 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%3 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%3 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%3 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- std %%g2, [%0 + 0x00]
- std %%g4, [%0 + 0x08]
- std %%o0, [%0 + 0x10]
- std %%o2, [%0 + 0x18]
- " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
- : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
- "l0", "l1", "l2", "l3", "l4", "l5");
- destp += 8;
- source1 += 8;
- source2 += 8;
- source3 += 8;
- }
- break;
- case 5:
- source2 = (long *) bh_ptr[2]->b_data;
- source3 = (long *) bh_ptr[3]->b_data;
- source4 = (long *) bh_ptr[4]->b_data;
- for (i = lines; i > 0; i--) {
- __asm__ __volatile__("
- ldd [%0 + 0x00], %%g2
- ldd [%0 + 0x08], %%g4
- ldd [%0 + 0x10], %%o0
- ldd [%0 + 0x18], %%o2
- ldd [%1 + 0x00], %%o4
- ldd [%1 + 0x08], %%l0
- ldd [%1 + 0x10], %%l2
- ldd [%1 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%2 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%2 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%2 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%2 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%3 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%3 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%3 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%3 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- ldd [%4 + 0x00], %%o4
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- ldd [%4 + 0x08], %%l0
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- ldd [%4 + 0x10], %%l2
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- ldd [%4 + 0x18], %%l4
- xor %%g2, %%o4, %%g2
- xor %%g3, %%o5, %%g3
- xor %%g4, %%l0, %%g4
- xor %%g5, %%l1, %%g5
- xor %%o0, %%l2, %%o0
- xor %%o1, %%l3, %%o1
- xor %%o2, %%l4, %%o2
- xor %%o3, %%l5, %%o3
- std %%g2, [%0 + 0x00]
- std %%g4, [%0 + 0x08]
- std %%o0, [%0 + 0x10]
- std %%o2, [%0 + 0x18]
- " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
- : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
- "l0", "l1", "l2", "l3", "l4", "l5");
- destp += 8;
- source1 += 8;
- source2 += 8;
- source3 += 8;
- source4 += 8;
- }
- break;
- }
-}
-#endif /* __sparc_v[78]__ */
-
-#ifdef __alpha__
-/*
- * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5.
- * There is a second version using EV6 prefetch instructions.
- *
- * Copyright (C) 2000 Richard Henderson (rth@redhat.com)
- */
-
-XORBLOCK_TEMPLATE(alpha)
-{
- long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
- long *d = (long *) bh_ptr[0]->b_data;
- long *s1 = (long *) bh_ptr[1]->b_data;
- long *s2, *s3, *s4;
-
- if (count == 2) goto two_blocks;
-
- s2 = (long *) bh_ptr[2]->b_data;
- if (count == 3) goto three_blocks;
-
- s3 = (long *) bh_ptr[3]->b_data;
- if (count == 4) goto four_blocks;
-
- s4 = (long *) bh_ptr[4]->b_data;
- goto five_blocks;
-
-two_blocks:
-asm volatile ("
- .align 4
-2:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,8(%0)
- ldq $3,8(%1)
-
- ldq $4,16(%0)
- ldq $5,16(%1)
- ldq $6,24(%0)
- ldq $7,24(%1)
-
- ldq $16,32(%0)
- ldq $17,32(%1)
- ldq $18,40(%0)
- ldq $19,40(%1)
-
- ldq $20,48(%0)
- ldq $21,48(%1)
- ldq $22,56(%0)
- xor $0,$1,$0 # 7 cycles from $1 load
-
- ldq $23,56(%1)
- xor $2,$3,$2
- stq $0,0(%0)
- xor $4,$5,$4
-
- stq $2,8(%0)
- xor $6,$7,$6
- stq $4,16(%0)
- xor $16,$17,$16
-
- stq $6,24(%0)
- xor $18,$19,$18
- stq $16,32(%0)
- xor $20,$21,$20
-
- stq $18,40(%0)
- xor $22,$23,$22
- stq $20,48(%0)
- subq %2,1,%2
-
- stq $22,56(%0)
- addq %0,64,%0
- addq %1,64,%1
- bgt %2,2b"
- : "=r"(d), "=r"(s1), "=r"(lines)
- : "0"(d), "1"(s1), "2"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
- return;
-
-three_blocks:
-asm volatile ("
- .align 4
-3:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,8(%0)
-
- ldq $4,8(%1)
- ldq $6,16(%0)
- ldq $7,16(%1)
- ldq $17,24(%0)
-
- ldq $18,24(%1)
- ldq $20,32(%0)
- ldq $21,32(%1)
- ldq $5,8(%2)
-
- ldq $16,16(%2)
- ldq $19,24(%2)
- ldq $22,32(%2)
- nop
-
- xor $0,$1,$1 # 8 cycles from $0 load
- xor $3,$4,$4 # 6 cycles from $4 load
- xor $6,$7,$7 # 6 cycles from $7 load
- xor $17,$18,$18 # 5 cycles from $18 load
-
- xor $1,$2,$2 # 9 cycles from $2 load
- xor $20,$21,$21 # 5 cycles from $21 load
- stq $2,0(%0)
- xor $4,$5,$5 # 6 cycles from $5 load
-
- stq $5,8(%0)
- xor $7,$16,$16 # 7 cycles from $16 load
- stq $16,16(%0)
- xor $18,$19,$19 # 7 cycles from $19 load
-
- stq $19,24(%0)
- xor $21,$22,$22 # 7 cycles from $22 load
- stq $22,32(%0)
- nop
-
- ldq $0,40(%0)
- ldq $1,40(%1)
- ldq $3,48(%0)
- ldq $4,48(%1)
-
- ldq $6,56(%0)
- ldq $7,56(%1)
- ldq $2,40(%2)
- ldq $5,48(%2)
-
- ldq $16,56(%2)
- xor $0,$1,$1 # 4 cycles from $1 load
- xor $3,$4,$4 # 5 cycles from $4 load
- xor $6,$7,$7 # 5 cycles from $7 load
-
- xor $1,$2,$2 # 4 cycles from $2 load
- xor $4,$5,$5 # 5 cycles from $5 load
- stq $2,40(%0)
- xor $7,$16,$16 # 4 cycles from $16 load
-
- stq $5,48(%0)
- subq %3,1,%3
- stq $16,56(%0)
- addq %2,64,%2
-
- addq %1,64,%1
- addq %0,64,%0
- bgt %3,3b"
- : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
- : "0"(d), "1"(s1), "2"(s2), "3"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21", "$22");
- return;
-
-four_blocks:
-asm volatile ("
- .align 4
-4:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,0(%3)
-
- ldq $4,8(%0)
- ldq $5,8(%1)
- ldq $6,8(%2)
- ldq $7,8(%3)
-
- ldq $16,16(%0)
- ldq $17,16(%1)
- ldq $18,16(%2)
- ldq $19,16(%3)
-
- ldq $20,24(%0)
- xor $0,$1,$1 # 6 cycles from $1 load
- ldq $21,24(%1)
- xor $2,$3,$3 # 6 cycles from $3 load
-
- ldq $0,24(%2)
- xor $1,$3,$3
- ldq $1,24(%3)
- xor $4,$5,$5 # 7 cycles from $5 load
-
- stq $3,0(%0)
- xor $6,$7,$7
- xor $16,$17,$17 # 7 cycles from $17 load
- xor $5,$7,$7
-
- stq $7,8(%0)
- xor $18,$19,$19 # 7 cycles from $19 load
- ldq $2,32(%0)
- xor $17,$19,$19
-
- ldq $3,32(%1)
- ldq $4,32(%2)
- ldq $5,32(%3)
- xor $20,$21,$21 # 8 cycles from $21 load
-
- ldq $6,40(%0)
- ldq $7,40(%1)
- ldq $16,40(%2)
- ldq $17,40(%3)
-
- stq $19,16(%0)
- xor $0,$1,$1 # 9 cycles from $1 load
- xor $2,$3,$3 # 5 cycles from $3 load
- xor $21,$1,$1
-
- ldq $18,48(%0)
- xor $4,$5,$5 # 5 cycles from $5 load
- ldq $19,48(%1)
- xor $3,$5,$5
-
- ldq $20,48(%2)
- ldq $21,48(%3)
- ldq $0,56(%0)
- ldq $1,56(%1)
-
- ldq $2,56(%2)
- xor $6,$7,$7 # 8 cycles from $6 load
- ldq $3,56(%3)
- xor $16,$17,$17 # 8 cycles from $17 load
-
- xor $7,$17,$17
- xor $18,$19,$19 # 5 cycles from $19 load
- xor $20,$21,$21 # 5 cycles from $21 load
- xor $19,$21,$21
-
- stq $1,24(%0)
- xor $0,$1,$1 # 5 cycles from $1 load
- stq $5,32(%0)
- xor $2,$3,$3 # 4 cycles from $3 load
-
- stq $17,40(%0)
- xor $1,$3,$3
- stq $21,48(%0)
- subq %4,1,%4
-
- stq $3,56(%0)
- addq %3,64,%3
- addq %2,64,%2
- addq %1,64,%1
-
- addq %0,64,%0
- bgt %4,4b"
- : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
- : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21");
- return;
-
-five_blocks:
-asm volatile ("
- ldq %0,0(%6)
- ldq %1,8(%6)
- ldq %2,16(%6)
- ldq %3,24(%6)
- ldq %4,32(%6)
- ldq %0,%7(%0)
- ldq %1,%7(%1)
- ldq %2,%7(%2)
- ldq %3,%7(%3)
- ldq %4,%7(%4)
- .align 4
-5:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,0(%3)
-
- ldq $4,0(%4)
- ldq $5,8(%0)
- ldq $6,8(%1)
- ldq $7,8(%2)
-
- ldq $16,8(%3)
- ldq $17,8(%4)
- ldq $18,16(%0)
- ldq $19,16(%1)
-
- ldq $20,16(%2)
- xor $0,$1,$1 # 6 cycles from $1 load
- ldq $21,16(%3)
- xor $2,$3,$3 # 6 cycles from $3 load
-
- ldq $0,16(%4)
- xor $1,$3,$3
- ldq $1,24(%0)
- xor $3,$4,$4 # 7 cycles from $4 load
-
- stq $4,0(%0)
- xor $5,$6,$6 # 7 cycles from $6 load
- xor $7,$16,$16 # 7 cycles from $16 load
- xor $6,$17,$17 # 7 cycles from $17 load
-
- ldq $2,24(%1)
- xor $16,$17,$17
- ldq $3,24(%2)
- xor $18,$19,$19 # 8 cycles from $19 load
-
- stq $17,8(%0)
- xor $19,$20,$20 # 8 cycles from $20 load
- ldq $4,24(%3)
- xor $21,$0,$0 # 7 cycles from $0 load
-
- ldq $5,24(%4)
- xor $20,$0,$0
- ldq $6,32(%0)
- ldq $7,32(%1)
-
- stq $0,16(%0)
- xor $1,$2,$2 # 6 cycles from $2 load
- ldq $16,32(%2)
- xor $3,$4,$4 # 4 cycles from $4 load
-
- ldq $17,32(%3)
- xor $2,$4,$4
- ldq $18,32(%4)
- ldq $19,40(%0)
-
- ldq $20,40(%1)
- ldq $21,40(%2)
- ldq $0,40(%3)
- xor $4,$5,$5 # 7 cycles from $5 load
-
- stq $5,24(%0)
- xor $6,$7,$7 # 7 cycles from $7 load
- ldq $1,40(%4)
- ldq $2,48(%0)
-
- ldq $3,48(%1)
- xor $7,$16,$16 # 7 cycles from $16 load
- ldq $4,48(%2)
- xor $17,$18,$18 # 6 cycles from $18 load
-
- ldq $5,48(%3)
- xor $16,$18,$18
- ldq $6,48(%4)
- xor $19,$20,$20 # 7 cycles from $20 load
-
- stq $18,32(%0)
- xor $20,$21,$21 # 8 cycles from $21 load
- ldq $7,56(%0)
- xor $0,$1,$1 # 6 cycles from $1 load
-
- ldq $16,56(%1)
- ldq $17,56(%2)
- ldq $18,56(%3)
- ldq $19,56(%4)
-
- xor $21,$1,$1
- xor $2,$3,$3 # 9 cycles from $3 load
- xor $3,$4,$4 # 9 cycles from $4 load
- xor $5,$6,$6 # 8 cycles from $6 load
-
- unop
- xor $4,$6,$6
- xor $7,$16,$16 # 7 cycles from $16 load
- xor $17,$18,$18 # 6 cycles from $18 load
-
- stq $6,48(%0)
- xor $16,$18,$18
- subq %5,1,%5
- xor $18,$19,$19 # 8 cycles from $19 load
-
- stq $19,56(%0)
- addq %4,64,%4
- addq %3,64,%3
- addq %2,64,%2
-
- addq %1,64,%1
- addq %0,64,%0
- bgt %5,5b"
- : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
- /* ARG! We've run out of asm arguments! We've got to reload
- all those pointers we just loaded. */
- : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21");
- return;
-}
-
-#define prefetch(base, ofs) \
- asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs))
-
-XORBLOCK_TEMPLATE(alpha_prefetch)
-{
- long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
- long *d = (long *) bh_ptr[0]->b_data;
- long *s1 = (long *) bh_ptr[1]->b_data;
- long *s2, *s3, *s4;
- long p;
-
- p = count == 2;
- prefetch(d, 0);
- prefetch(s1, 0);
- prefetch(d, 64);
- prefetch(s1, 64);
- prefetch(d, 128);
- prefetch(s1, 128);
- prefetch(d, 192);
- prefetch(s1, 192);
- if (p) goto two_blocks;
-
- s2 = (long *) bh_ptr[2]->b_data;
- p = count == 3;
- prefetch(s2, 0);
- prefetch(s2, 64);
- prefetch(s2, 128);
- prefetch(s2, 192);
- if (p) goto three_blocks;
-
- s3 = (long *) bh_ptr[3]->b_data;
- p = count == 4;
- prefetch(s3, 0);
- prefetch(s3, 64);
- prefetch(s3, 128);
- prefetch(s3, 192);
- if (p) goto four_blocks;
-
- s4 = (long *) bh_ptr[4]->b_data;
- prefetch(s4, 0);
- prefetch(s4, 64);
- prefetch(s4, 128);
- prefetch(s4, 192);
- goto five_blocks;
-
-two_blocks:
-asm volatile ("
- .align 4
-2:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,8(%0)
- ldq $3,8(%1)
-
- ldq $4,16(%0)
- ldq $5,16(%1)
- ldq $6,24(%0)
- ldq $7,24(%1)
-
- ldq $16,32(%0)
- ldq $17,32(%1)
- ldq $18,40(%0)
- ldq $19,40(%1)
-
- ldq $20,48(%0)
- ldq $21,48(%1)
- ldq $22,56(%0)
- ldq $23,56(%1)
-
- ldq $31,256(%0)
- xor $0,$1,$0 # 8 cycles from $1 load
- ldq $31,256(%1)
- xor $2,$3,$2
-
- stq $0,0(%0)
- xor $4,$5,$4
- stq $2,8(%0)
- xor $6,$7,$6
-
- stq $4,16(%0)
- xor $16,$17,$16
- stq $6,24(%0)
- xor $18,$19,$18
-
- stq $16,32(%0)
- xor $20,$21,$20
- stq $18,40(%0)
- xor $22,$23,$22
-
- stq $20,48(%0)
- subq %2,1,%2
- stq $22,56(%0)
- addq %0,64,%0
-
- addq %1,64,%1
- bgt %2,2b"
- : "=r"(d), "=r"(s1), "=r"(lines)
- : "0"(d), "1"(s1), "2"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
- return;
-
-three_blocks:
-asm volatile ("
- .align 4
-3:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,8(%0)
-
- ldq $4,8(%1)
- ldq $6,16(%0)
- ldq $7,16(%1)
- ldq $17,24(%0)
-
- ldq $18,24(%1)
- ldq $20,32(%0)
- ldq $21,32(%1)
- ldq $5,8(%2)
-
- ldq $16,16(%2)
- ldq $19,24(%2)
- ldq $22,32(%2)
- nop
-
- xor $0,$1,$1 # 8 cycles from $0 load
- xor $3,$4,$4 # 7 cycles from $4 load
- xor $6,$7,$7 # 6 cycles from $7 load
- xor $17,$18,$18 # 5 cycles from $18 load
-
- xor $1,$2,$2 # 9 cycles from $2 load
- xor $20,$21,$21 # 5 cycles from $21 load
- stq $2,0(%0)
- xor $4,$5,$5 # 6 cycles from $5 load
-
- stq $5,8(%0)
- xor $7,$16,$16 # 7 cycles from $16 load
- stq $16,16(%0)
- xor $18,$19,$19 # 7 cycles from $19 load
-
- stq $19,24(%0)
- xor $21,$22,$22 # 7 cycles from $22 load
- stq $22,32(%0)
- nop
-
- ldq $0,40(%0)
- ldq $1,40(%1)
- ldq $3,48(%0)
- ldq $4,48(%1)
-
- ldq $6,56(%0)
- ldq $7,56(%1)
- ldq $2,40(%2)
- ldq $5,48(%2)
-
- ldq $16,56(%2)
- ldq $31,256(%0)
- ldq $31,256(%1)
- ldq $31,256(%2)
-
- xor $0,$1,$1 # 6 cycles from $1 load
- xor $3,$4,$4 # 5 cycles from $4 load
- xor $6,$7,$7 # 5 cycles from $7 load
- xor $1,$2,$2 # 4 cycles from $2 load
-
- xor $4,$5,$5 # 5 cycles from $5 load
- xor $7,$16,$16 # 4 cycles from $16 load
- stq $2,40(%0)
- subq %3,1,%3
-
- stq $5,48(%0)
- addq %2,64,%2
- stq $16,56(%0)
- addq %1,64,%1
-
- addq %0,64,%0
- bgt %3,3b"
- : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
- : "0"(d), "1"(s1), "2"(s2), "3"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21", "$22");
- return;
-
-four_blocks:
-asm volatile ("
- .align 4
-4:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,0(%3)
-
- ldq $4,8(%0)
- ldq $5,8(%1)
- ldq $6,8(%2)
- ldq $7,8(%3)
-
- ldq $16,16(%0)
- ldq $17,16(%1)
- ldq $18,16(%2)
- ldq $19,16(%3)
-
- ldq $20,24(%0)
- xor $0,$1,$1 # 6 cycles from $1 load
- ldq $21,24(%1)
- xor $2,$3,$3 # 6 cycles from $3 load
-
- ldq $0,24(%2)
- xor $1,$3,$3
- ldq $1,24(%3)
- xor $4,$5,$5 # 7 cycles from $5 load
-
- stq $3,0(%0)
- xor $6,$7,$7
- xor $16,$17,$17 # 7 cycles from $17 load
- xor $5,$7,$7
-
- stq $7,8(%0)
- xor $18,$19,$19 # 7 cycles from $19 load
- ldq $2,32(%0)
- xor $17,$19,$19
-
- ldq $3,32(%1)
- ldq $4,32(%2)
- ldq $5,32(%3)
- xor $20,$21,$21 # 8 cycles from $21 load
-
- ldq $6,40(%0)
- ldq $7,40(%1)
- ldq $16,40(%2)
- ldq $17,40(%3)
-
- stq $19,16(%0)
- xor $0,$1,$1 # 9 cycles from $1 load
- xor $2,$3,$3 # 5 cycles from $3 load
- xor $21,$1,$1
-
- ldq $18,48(%0)
- xor $4,$5,$5 # 5 cycles from $5 load
- ldq $19,48(%1)
- xor $3,$5,$5
-
- ldq $20,48(%2)
- ldq $21,48(%3)
- ldq $0,56(%0)
- ldq $1,56(%1)
-
- ldq $2,56(%2)
- xor $6,$7,$7 # 8 cycles from $6 load
- ldq $3,56(%3)
- xor $16,$17,$17 # 8 cycles from $17 load
-
- ldq $31,256(%0)
- xor $7,$17,$17
- ldq $31,256(%1)
- xor $18,$19,$19 # 6 cycles from $19 load
-
- ldq $31,256(%2)
- xor $20,$21,$21 # 6 cycles from $21 load
- ldq $31,256(%3)
- xor $19,$21,$21
-
- stq $1,24(%0)
- xor $0,$1,$1 # 7 cycles from $1 load
- stq $5,32(%0)
- xor $2,$3,$3 # 6 cycles from $3 load
-
- stq $17,40(%0)
- xor $1,$3,$3
- stq $21,48(%0)
- subq %4,1,%4
-
- stq $3,56(%0)
- addq %3,64,%3
- addq %2,64,%2
- addq %1,64,%1
-
- addq %0,64,%0
- bgt %4,4b"
- : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
- : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21");
- return;
-
-five_blocks:
-asm volatile ("
- ldq %0,0(%6)
- ldq %1,8(%6)
- ldq %2,16(%6)
- ldq %3,24(%6)
- ldq %4,32(%6)
- ldq %0,%7(%0)
- ldq %1,%7(%1)
- ldq %2,%7(%2)
- ldq %3,%7(%3)
- ldq %4,%7(%4)
- .align 4
-5:
- ldq $0,0(%0)
- ldq $1,0(%1)
- ldq $2,0(%2)
- ldq $3,0(%3)
-
- ldq $4,0(%4)
- ldq $5,8(%0)
- ldq $6,8(%1)
- ldq $7,8(%2)
-
- ldq $16,8(%3)
- ldq $17,8(%4)
- ldq $18,16(%0)
- ldq $19,16(%1)
-
- ldq $20,16(%2)
- xor $0,$1,$1 # 6 cycles from $1 load
- ldq $21,16(%3)
- xor $2,$3,$3 # 6 cycles from $3 load
-
- ldq $0,16(%4)
- xor $1,$3,$3
- ldq $1,24(%0)
- xor $3,$4,$4 # 7 cycles from $4 load
-
- stq $4,0(%0)
- xor $5,$6,$6 # 7 cycles from $6 load
- xor $7,$16,$16 # 7 cycles from $16 load
- xor $6,$17,$17 # 7 cycles from $17 load
-
- ldq $2,24(%1)
- xor $16,$17,$17
- ldq $3,24(%2)
- xor $18,$19,$19 # 8 cycles from $19 load
-
- stq $17,8(%0)
- xor $19,$20,$20 # 8 cycles from $20 load
- ldq $4,24(%3)
- xor $21,$0,$0 # 7 cycles from $0 load
-
- ldq $5,24(%4)
- xor $20,$0,$0
- ldq $6,32(%0)
- ldq $7,32(%1)
-
- stq $0,16(%0)
- xor $1,$2,$2 # 6 cycles from $2 load
- ldq $16,32(%2)
- xor $3,$4,$4 # 4 cycles from $4 load
-
- ldq $17,32(%3)
- xor $2,$4,$4
- ldq $18,32(%4)
- ldq $19,40(%0)
-
- ldq $20,40(%1)
- ldq $21,40(%2)
- ldq $0,40(%3)
- xor $4,$5,$5 # 7 cycles from $5 load
-
- stq $5,24(%0)
- xor $6,$7,$7 # 7 cycles from $7 load
- ldq $1,40(%4)
- ldq $2,48(%0)
-
- ldq $3,48(%1)
- xor $7,$16,$16 # 7 cycles from $16 load
- ldq $4,48(%2)
- xor $17,$18,$18 # 6 cycles from $18 load
-
- ldq $5,48(%3)
- xor $16,$18,$18
- ldq $6,48(%4)
- xor $19,$20,$20 # 7 cycles from $20 load
-
- stq $18,32(%0)
- xor $20,$21,$21 # 8 cycles from $21 load
- ldq $7,56(%0)
- xor $0,$1,$1 # 6 cycles from $1 load
-
- ldq $16,56(%1)
- ldq $17,56(%2)
- ldq $18,56(%3)
- ldq $19,56(%4)
-
- ldq $31,256(%0)
- xor $21,$1,$1
- ldq $31,256(%1)
- xor $2,$3,$3 # 9 cycles from $3 load
-
- ldq $31,256(%2)
- xor $3,$4,$4 # 9 cycles from $4 load
- ldq $31,256(%3)
- xor $5,$6,$6 # 8 cycles from $6 load
-
- ldq $31,256(%4)
- xor $4,$6,$6
- xor $7,$16,$16 # 7 cycles from $16 load
- xor $17,$18,$18 # 6 cycles from $18 load
-
- stq $6,48(%0)
- xor $16,$18,$18
- subq %5,1,%5
- xor $18,$19,$19 # 8 cycles from $19 load
-
- stq $19,56(%0)
- addq %4,64,%4
- addq %3,64,%3
- addq %2,64,%2
-
- addq %1,64,%1
- addq %0,64,%0
- bgt %5,5b"
- : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
- /* ARG! We've run out of asm arguments! We've got to reload
- all those pointers we just loaded. */
- : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
- : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
- "$16", "$17", "$18", "$19", "$20", "$21");
- return;
-}
-
-#undef prefetch
-
-#endif /* __alpha__ */
-
-#ifndef __sparc_v9__
-
-/*
- * this one works reasonably on any x86 CPU
- * (send me an assembly version for inclusion if you can make it faster)
- *
- * this one is just as fast as written in pure assembly on x86.
- * the reason for this separate version is that the
- * fast open-coded xor routine "32reg" produces suboptimal code
- * on x86, due to lack of registers.
- */
-XORBLOCK_TEMPLATE(8regs)
-{
- int len = bh_ptr[0]->b_size;
- long *destp = (long *) bh_ptr[0]->b_data;
- long *source1, *source2, *source3, *source4;
- long lines = len / (sizeof (long)) / 8, i;
-
- switch(count) {
- case 2:
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- *(destp + 0) ^= *(source1 + 0);
- *(destp + 1) ^= *(source1 + 1);
- *(destp + 2) ^= *(source1 + 2);
- *(destp + 3) ^= *(source1 + 3);
- *(destp + 4) ^= *(source1 + 4);
- *(destp + 5) ^= *(source1 + 5);
- *(destp + 6) ^= *(source1 + 6);
- *(destp + 7) ^= *(source1 + 7);
- source1 += 8;
- destp += 8;
- }
- break;
- case 3:
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- *(destp + 0) ^= *(source1 + 0);
- *(destp + 0) ^= *(source2 + 0);
- *(destp + 1) ^= *(source1 + 1);
- *(destp + 1) ^= *(source2 + 1);
- *(destp + 2) ^= *(source1 + 2);
- *(destp + 2) ^= *(source2 + 2);
- *(destp + 3) ^= *(source1 + 3);
- *(destp + 3) ^= *(source2 + 3);
- *(destp + 4) ^= *(source1 + 4);
- *(destp + 4) ^= *(source2 + 4);
- *(destp + 5) ^= *(source1 + 5);
- *(destp + 5) ^= *(source2 + 5);
- *(destp + 6) ^= *(source1 + 6);
- *(destp + 6) ^= *(source2 + 6);
- *(destp + 7) ^= *(source1 + 7);
- *(destp + 7) ^= *(source2 + 7);
- source1 += 8;
- source2 += 8;
- destp += 8;
- }
- break;
- case 4:
- source3 = (long *) bh_ptr[3]->b_data;
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- *(destp + 0) ^= *(source1 + 0);
- *(destp + 0) ^= *(source2 + 0);
- *(destp + 0) ^= *(source3 + 0);
- *(destp + 1) ^= *(source1 + 1);
- *(destp + 1) ^= *(source2 + 1);
- *(destp + 1) ^= *(source3 + 1);
- *(destp + 2) ^= *(source1 + 2);
- *(destp + 2) ^= *(source2 + 2);
- *(destp + 2) ^= *(source3 + 2);
- *(destp + 3) ^= *(source1 + 3);
- *(destp + 3) ^= *(source2 + 3);
- *(destp + 3) ^= *(source3 + 3);
- *(destp + 4) ^= *(source1 + 4);
- *(destp + 4) ^= *(source2 + 4);
- *(destp + 4) ^= *(source3 + 4);
- *(destp + 5) ^= *(source1 + 5);
- *(destp + 5) ^= *(source2 + 5);
- *(destp + 5) ^= *(source3 + 5);
- *(destp + 6) ^= *(source1 + 6);
- *(destp + 6) ^= *(source2 + 6);
- *(destp + 6) ^= *(source3 + 6);
- *(destp + 7) ^= *(source1 + 7);
- *(destp + 7) ^= *(source2 + 7);
- *(destp + 7) ^= *(source3 + 7);
- source1 += 8;
- source2 += 8;
- source3 += 8;
- destp += 8;
- }
- break;
- case 5:
- source4 = (long *) bh_ptr[4]->b_data;
- source3 = (long *) bh_ptr[3]->b_data;
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- *(destp + 0) ^= *(source1 + 0);
- *(destp + 0) ^= *(source2 + 0);
- *(destp + 0) ^= *(source3 + 0);
- *(destp + 0) ^= *(source4 + 0);
- *(destp + 1) ^= *(source1 + 1);
- *(destp + 1) ^= *(source2 + 1);
- *(destp + 1) ^= *(source3 + 1);
- *(destp + 1) ^= *(source4 + 1);
- *(destp + 2) ^= *(source1 + 2);
- *(destp + 2) ^= *(source2 + 2);
- *(destp + 2) ^= *(source3 + 2);
- *(destp + 2) ^= *(source4 + 2);
- *(destp + 3) ^= *(source1 + 3);
- *(destp + 3) ^= *(source2 + 3);
- *(destp + 3) ^= *(source3 + 3);
- *(destp + 3) ^= *(source4 + 3);
- *(destp + 4) ^= *(source1 + 4);
- *(destp + 4) ^= *(source2 + 4);
- *(destp + 4) ^= *(source3 + 4);
- *(destp + 4) ^= *(source4 + 4);
- *(destp + 5) ^= *(source1 + 5);
- *(destp + 5) ^= *(source2 + 5);
- *(destp + 5) ^= *(source3 + 5);
- *(destp + 5) ^= *(source4 + 5);
- *(destp + 6) ^= *(source1 + 6);
- *(destp + 6) ^= *(source2 + 6);
- *(destp + 6) ^= *(source3 + 6);
- *(destp + 6) ^= *(source4 + 6);
- *(destp + 7) ^= *(source1 + 7);
- *(destp + 7) ^= *(source2 + 7);
- *(destp + 7) ^= *(source3 + 7);
- *(destp + 7) ^= *(source4 + 7);
- source1 += 8;
- source2 += 8;
- source3 += 8;
- source4 += 8;
- destp += 8;
- }
- break;
- }
-}
-
-/*
- * platform independent RAID5 checksum calculation, this should
- * be very fast on any platform that has a decent amount of
- * registers. (32 or more)
- */
-XORBLOCK_TEMPLATE(32regs)
-{
- int size = bh_ptr[0]->b_size;
- int lines = size / (sizeof (long)) / 8, i;
- long *destp = (long *) bh_ptr[0]->b_data;
- long *source1, *source2, *source3, *source4;
-
- /* LOTS of registers available...
- We do explicite loop-unrolling here for code which
- favours RISC machines. In fact this is almoast direct
- RISC assembly on Alpha and SPARC :-) */
-
-
- switch(count) {
- case 2:
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- register long d0, d1, d2, d3, d4, d5, d6, d7;
- d0 = destp[0]; /* Pull the stuff into registers */
- d1 = destp[1]; /* ... in bursts, if possible. */
- d2 = destp[2];
- d3 = destp[3];
- d4 = destp[4];
- d5 = destp[5];
- d6 = destp[6];
- d7 = destp[7];
- d0 ^= source1[0];
- d1 ^= source1[1];
- d2 ^= source1[2];
- d3 ^= source1[3];
- d4 ^= source1[4];
- d5 ^= source1[5];
- d6 ^= source1[6];
- d7 ^= source1[7];
- destp[0] = d0; /* Store the result (in burts) */
- destp[1] = d1;
- destp[2] = d2;
- destp[3] = d3;
- destp[4] = d4; /* Store the result (in burts) */
- destp[5] = d5;
- destp[6] = d6;
- destp[7] = d7;
- source1 += 8;
- destp += 8;
- }
- break;
- case 3:
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- register long d0, d1, d2, d3, d4, d5, d6, d7;
- d0 = destp[0]; /* Pull the stuff into registers */
- d1 = destp[1]; /* ... in bursts, if possible. */
- d2 = destp[2];
- d3 = destp[3];
- d4 = destp[4];
- d5 = destp[5];
- d6 = destp[6];
- d7 = destp[7];
- d0 ^= source1[0];
- d1 ^= source1[1];
- d2 ^= source1[2];
- d3 ^= source1[3];
- d4 ^= source1[4];
- d5 ^= source1[5];
- d6 ^= source1[6];
- d7 ^= source1[7];
- d0 ^= source2[0];
- d1 ^= source2[1];
- d2 ^= source2[2];
- d3 ^= source2[3];
- d4 ^= source2[4];
- d5 ^= source2[5];
- d6 ^= source2[6];
- d7 ^= source2[7];
- destp[0] = d0; /* Store the result (in burts) */
- destp[1] = d1;
- destp[2] = d2;
- destp[3] = d3;
- destp[4] = d4; /* Store the result (in burts) */
- destp[5] = d5;
- destp[6] = d6;
- destp[7] = d7;
- source1 += 8;
- source2 += 8;
- destp += 8;
- }
- break;
- case 4:
- source3 = (long *) bh_ptr[3]->b_data;
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- register long d0, d1, d2, d3, d4, d5, d6, d7;
- d0 = destp[0]; /* Pull the stuff into registers */
- d1 = destp[1]; /* ... in bursts, if possible. */
- d2 = destp[2];
- d3 = destp[3];
- d4 = destp[4];
- d5 = destp[5];
- d6 = destp[6];
- d7 = destp[7];
- d0 ^= source1[0];
- d1 ^= source1[1];
- d2 ^= source1[2];
- d3 ^= source1[3];
- d4 ^= source1[4];
- d5 ^= source1[5];
- d6 ^= source1[6];
- d7 ^= source1[7];
- d0 ^= source2[0];
- d1 ^= source2[1];
- d2 ^= source2[2];
- d3 ^= source2[3];
- d4 ^= source2[4];
- d5 ^= source2[5];
- d6 ^= source2[6];
- d7 ^= source2[7];
- d0 ^= source3[0];
- d1 ^= source3[1];
- d2 ^= source3[2];
- d3 ^= source3[3];
- d4 ^= source3[4];
- d5 ^= source3[5];
- d6 ^= source3[6];
- d7 ^= source3[7];
- destp[0] = d0; /* Store the result (in burts) */
- destp[1] = d1;
- destp[2] = d2;
- destp[3] = d3;
- destp[4] = d4; /* Store the result (in burts) */
- destp[5] = d5;
- destp[6] = d6;
- destp[7] = d7;
- source1 += 8;
- source2 += 8;
- source3 += 8;
- destp += 8;
- }
- break;
- case 5:
- source4 = (long *) bh_ptr[4]->b_data;
- source3 = (long *) bh_ptr[3]->b_data;
- source2 = (long *) bh_ptr[2]->b_data;
- source1 = (long *) bh_ptr[1]->b_data;
- for (i = lines; i > 0; i--) {
- register long d0, d1, d2, d3, d4, d5, d6, d7;
- d0 = destp[0]; /* Pull the stuff into registers */
- d1 = destp[1]; /* ... in bursts, if possible. */
- d2 = destp[2];
- d3 = destp[3];
- d4 = destp[4];
- d5 = destp[5];
- d6 = destp[6];
- d7 = destp[7];
- d0 ^= source1[0];
- d1 ^= source1[1];
- d2 ^= source1[2];
- d3 ^= source1[3];
- d4 ^= source1[4];
- d5 ^= source1[5];
- d6 ^= source1[6];
- d7 ^= source1[7];
- d0 ^= source2[0];
- d1 ^= source2[1];
- d2 ^= source2[2];
- d3 ^= source2[3];
- d4 ^= source2[4];
- d5 ^= source2[5];
- d6 ^= source2[6];
- d7 ^= source2[7];
- d0 ^= source3[0];
- d1 ^= source3[1];
- d2 ^= source3[2];
- d3 ^= source3[3];
- d4 ^= source3[4];
- d5 ^= source3[5];
- d6 ^= source3[6];
- d7 ^= source3[7];
- d0 ^= source4[0];
- d1 ^= source4[1];
- d2 ^= source4[2];
- d3 ^= source4[3];
- d4 ^= source4[4];
- d5 ^= source4[5];
- d6 ^= source4[6];
- d7 ^= source4[7];
- destp[0] = d0; /* Store the result (in burts) */
- destp[1] = d1;
- destp[2] = d2;
- destp[3] = d3;
- destp[4] = d4; /* Store the result (in burts) */
- destp[5] = d5;
- destp[6] = d6;
- destp[7] = d7;
- source1 += 8;
- source2 += 8;
- source3 += 8;
- source4 += 8;
- destp += 8;
- }
- break;
- }
-}
-
-/*
- * (the -6*32 shift factor colors the cache)
- */
-#define SIZE (PAGE_SIZE-6*32)
-
-static void xor_speed ( struct xor_block_template * func,
- struct buffer_head *b1, struct buffer_head *b2)
-{
- int speed;
- unsigned long now;
- int i, count, max;
- struct buffer_head *bh_ptr[6];
-
- func->next = xor_functions;
- xor_functions = func;
- bh_ptr[0] = b1;
- bh_ptr[1] = b2;
-
- /*
- * count the number of XORs done during a whole jiffy.
- * calculate the speed of checksumming from this.
- * (we use a 2-page allocation to have guaranteed
- * color L1-cache layout)
- */
- max = 0;
- for (i = 0; i < 5; i++) {
- now = jiffies;
- count = 0;
- while (jiffies == now) {
- mb();
- func->xor_block(2,bh_ptr);
- mb();
- count++;
- mb();
- }
- if (count > max)
- max = count;
- }
-
- speed = max * (HZ*SIZE/1024);
- func->speed = speed;
-
- printk( " %-10s: %5d.%03d MB/sec\n", func->name,
- speed / 1000, speed % 1000);
-}
-
-static inline void pick_fastest_function(void)
-{
- struct xor_block_template *f, *fastest;
-
- fastest = xor_functions;
- for (f = fastest; f; f = f->next) {
- if (f->speed > fastest->speed)
- fastest = f;
- }
-#ifdef CONFIG_X86_XMM
- if (cpu_has_xmm) {
- /* we force the use of the KNI xor block because it
- can write around l2. we may also be able
- to load into the l1 only depending on how
- the cpu deals with a load to a line that is
- being prefetched.
- */
- fastest = &t_xor_block_pIII_kni;
- }
-#endif
-#ifdef __alpha__
- if (implver() == IMPLVER_EV6) {
- /* Force the use of alpha_prefetch if EV6, as it
- is significantly faster in the cold cache case. */
- fastest = &t_xor_block_alpha_prefetch;
- }
-#endif
- xor_block = fastest->xor_block;
- printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
- fastest->speed / 1000, fastest->speed % 1000);
-}
-
-static struct buffer_head b1, b2;
-
-void calibrate_xor_block(void)
-{
- if (xor_block)
- return;
- memset(&b1,0,sizeof(b1));
- b2 = b1;
-
- b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
- if (!b1.b_data) {
- pick_fastest_function();
- return;
- }
- b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
-
- b1.b_size = SIZE;
-
- printk(KERN_INFO "raid5: measuring checksumming speed\n");
-
- sti(); /* should be safe */
-
-#if defined(__sparc__) && !defined(__sparc_v9__)
- printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
- xor_speed(&t_xor_block_SPARC,&b1,&b2);
-#endif
-
-#ifdef CONFIG_X86_XMM
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
- xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
- }
-#endif /* CONFIG_X86_XMM */
-
-#ifdef __i386__
- if (md_cpu_has_mmx()) {
- printk(KERN_INFO
- "raid5: MMX detected, trying high-speed MMX checksum routines\n");
- xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
- xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
- }
-#endif /* __i386__ */
-
-#ifdef __alpha__
- xor_speed(&t_xor_block_alpha,&b1,&b2);
- xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2);
-#endif
-
- xor_speed(&t_xor_block_8regs,&b1,&b2);
- xor_speed(&t_xor_block_32regs,&b1,&b2);
-
- free_pages((unsigned long)b1.b_data,2);
- pick_fastest_function();
-}
-
-#else /* __sparc_v9__ */
-
-void calibrate_xor_block(void)
-{
- if (xor_block)
- return;
- printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
- xor_block = xor_block_VIS;
-}
-
-#endif /* __sparc_v9__ */
-
-MD_EXPORT_SYMBOL(xor_block);
-MD_EXPORT_SYMBOL(calibrate_xor_block);
-
-#ifdef MODULE
-int init_module(void)
-{
- calibrate_xor_block();
- return 0;
-}
-#endif
+/*
+ * xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
+ *
+ * Dispatch optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/raid/md.h>
+#include <linux/raid/xor.h>
+#include <asm/xor.h>
+
+/* The xor routines to use. */
+static struct xor_block_template *active_template;
+
+void
+xor_block(unsigned int count, struct buffer_head **bh_ptr)
+{
+ unsigned long *p0, *p1, *p2, *p3, *p4;
+ unsigned long bytes = bh_ptr[0]->b_size;
+
+ p0 = (unsigned long *) bh_ptr[0]->b_data;
+ p1 = (unsigned long *) bh_ptr[1]->b_data;
+ if (count == 2) {
+ active_template->do_2(bytes, p0, p1);
+ return;
+ }
+
+ p2 = (unsigned long *) bh_ptr[2]->b_data;
+ if (count == 3) {
+ active_template->do_3(bytes, p0, p1, p2);
+ return;
+ }
+
+ p3 = (unsigned long *) bh_ptr[3]->b_data;
+ if (count == 4) {
+ active_template->do_4(bytes, p0, p1, p2, p3);
+ return;
+ }
+
+ p4 = (unsigned long *) bh_ptr[4]->b_data;
+ active_template->do_5(bytes, p0, p1, p2, p3, p4);
+}
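
The dispatcher above relies on each template providing do_2 through do_5 routines that take a byte count plus raw destination/source pointers. As a hedged illustration only (not part of the patch; the template name and the GNU-style field initializer are assumptions), a portable 2-source routine matching that calling convention could look like:

/* Illustrative only: a generic do_2 routine for xor_block()'s
   do_2(bytes, p0, p1) convention, xoring 8 longs per iteration. */
static void
xor_sketch_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	long lines = bytes / (sizeof (unsigned long) * 8);

	while (lines-- > 0) {
		p0[0] ^= p1[0];  p0[1] ^= p1[1];
		p0[2] ^= p1[2];  p0[3] ^= p1[3];
		p0[4] ^= p1[4];  p0[5] ^= p1[5];
		p0[6] ^= p1[6];  p0[7] ^= p1[7];
		p0 += 8;
		p1 += 8;
	}
}

static struct xor_block_template xor_template_sketch = {
	name:	"sketch",
	do_2:	xor_sketch_2,
	/* do_3, do_4 and do_5 would follow the same pattern. */
};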
+
+/* Set of all registered templates. */
+static struct xor_block_template *template_list;
+
+/* The -6*32 shift factor colors the cache. */
+#define BENCH_SIZE (PAGE_SIZE-6*32)
+
+static void
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+{
+ int speed;
+ unsigned long now;
+ int i, count, max;
+
+ tmpl->next = template_list;
+ template_list = tmpl;
+
+ /*
+ * Count the number of XORs done during a whole jiffy, and use
+ * this to calculate the speed of checksumming. We use a 2-page
+ * allocation to have guaranteed color L1-cache layout.
+ */
+ max = 0;
+ for (i = 0; i < 5; i++) {
+ now = jiffies;
+ count = 0;
+ while (jiffies == now) {
+ mb();
+ tmpl->do_2(BENCH_SIZE, b1, b2);
+ mb();
+ count++;
+ mb();
+ }
+ if (count > max)
+ max = count;
+ }
+
+ speed = max * (HZ * BENCH_SIZE / 1024);
+ tmpl->speed = speed;
+
+ printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
+ speed / 1000, speed % 1000);
+}
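
To spell out the units: max is the number of complete BENCH_SIZE passes finished in one jiffy, so max * HZ * BENCH_SIZE is bytes per second and dividing by 1024 yields KB/sec, which the printk then splits into an approximate MB/sec figure. A hedged worked example with assumed figures (HZ, page size and loop count are illustrative, not measured):

#include <stdio.h>

int main(void)
{
	/* Assumed: HZ = 1024, 8 KB pages, 50 passes per jiffy. */
	int hz = 1024, bench_size = 8192 - 6 * 32, max = 50;
	int speed = max * (hz * bench_size / 1024);	/* KB/sec */

	/* Prints "  400.000 MB/sec" for these figures. */
	printf("%5d.%03d MB/sec\n", speed / 1000, speed % 1000);
	return 0;
}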
+
+static int
+calibrate_xor_block(void)
+{
+ void *b1, *b2;
+ struct xor_block_template *f, *fastest;
+
+ b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
+ if (! b1) {
+ printk("raid5: Yikes! No memory available.\n");
+ return -ENOMEM;
+ }
+ b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+
+ printk(KERN_INFO "raid5: measuring checksumming speed\n");
+ sti();
+
+#define xor_speed(templ) do_xor_speed((templ), b1, b2)
+
+ XOR_TRY_TEMPLATES;
+
+#undef xor_speed
+
+ free_pages((unsigned long)b1, 2);
+
+ fastest = template_list;
+ for (f = fastest; f; f = f->next)
+ if (f->speed > fastest->speed)
+ fastest = f;
+
+#ifdef XOR_SELECT_TEMPLATE
+ fastest = XOR_SELECT_TEMPLATE(fastest);
+#endif
+
+ active_template = fastest;
+ printk("raid5: using function: %s (%d.%03d MB/sec)\n",
+ fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+
+ return 0;
+}
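
The per-architecture <asm/xor.h> is expected to supply XOR_TRY_TEMPLATES, which under the local xor_speed() macro benchmarks every template that header declares, and may also supply XOR_SELECT_TEMPLATE to override the measured winner. A hedged sketch of the shape such a header could take; the template names and the cpu_has_fancy_xor() predicate are invented for illustration:

/* Illustrative shape only -- not any architecture's real <asm/xor.h>. */
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_8regs);	\
		xor_speed(&xor_block_32regs);	\
	} while (0)

/* Optionally force a routine regardless of the benchmark, e.g. one
   that avoids polluting the caches on CPUs that support it. */
#define XOR_SELECT_TEMPLATE(FASTEST)		\
	(cpu_has_fancy_xor() ? &xor_block_fancy : (FASTEST))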
+
+MD_EXPORT_SYMBOL(xor_block);
+
+module_init(calibrate_xor_block);
@@ -1129,7 +1129,7 @@ static int nsc_ircc_hard_xmit_fir(struct sk_buff *skb, struct net_device *dev)
	if ((speed = irda_get_speed(skb)) != self->io.speed) {
/* Check for empty frame */
if (!skb->len) {
- nsc_ircc_change_speed_complete(self, speed);
+ nsc_ircc_change_speed(self, speed);
return 0;
} else
self->new_speed = speed;
@@ -207,8 +207,10 @@ int __init a2091_detect(Scsi_Host_Template *tpnt)
		continue;
instance = scsi_register (tpnt, sizeof (struct WD33C93_hostdata));
- if(instance == NULL)
- continue;
+ if (instance == NULL) {
+ release_mem_region(address, 256);
+ continue;
+ }
instance->base = ZTWO_VADDR(address);
instance->irq = IRQ_AMIGA_PORTS;
instance->unique_id = z->slotaddr;
@@ -66,8 +66,8 @@ static __inline__ long atomic_add_return(int i, atomic_t * v)
	long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
+ " addl %0,%3,%2\n"
" addl %0,%3,%0\n"
- " mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
@@ -84,8 +84,8 @@ static __inline__ long atomic_sub_return(int i, atomic_t * v)
	long temp, result;
__asm__ __volatile__(
"1: ldl_l %0,%1\n"
+ " subl %0,%3,%2\n"
" subl %0,%3,%0\n"
- " mov %0,%2\n"
" stl_c %0,%1\n"
" beq %0,2f\n"
" mb\n"
__asm__("stw %1,%0" : "=m"(mem) : "r"(val))
#endif
+/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
+ a mechanism by which the user can annotate likely branch directions and
+ expect the blocks to be reordered appropriately. Define __builtin_expect
+ to nothing for earlier compilers. */
+
+#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
+#define __builtin_expect(x, expected_value) (x)
+#endif
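
With the fallback in place, code can annotate branch likelihood unconditionally; on compilers older than 2.96 the hint simply disappears and the condition is evaluated as-is. A small hedged example (the helper and its names are invented for illustration):

#include <linux/slab.h>

static inline void *example_alloc(size_t size)
{
	void *buf = kmalloc(size, GFP_KERNEL);

	/* Hint that allocation failure is the cold path. */
	if (__builtin_expect(buf == NULL, 0))
		return NULL;
	return buf;
}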
+
#endif /* __ALPHA_COMPILER_H */
+++ /dev/null
-#ifndef _ALPHA_SEMAPHORE_HELPER_H
-#define _ALPHA_SEMAPHORE_HELPER_H
-
-/*
- * SMP- and interrupt-safe semaphores helper functions.
- *
- * (C) Copyright 1996 Linus Torvalds
- * (C) Copyright 1999 Richard Henderson
- */
-
-/*
- * These two _must_ execute atomically wrt each other.
- *
- * This is trivially done with load_locked/store_cond,
- * which we have. Let the rest of the losers suck eggs.
- */
-
-static inline void
-wake_one_more(struct semaphore * sem)
-{
- atomic_inc(&sem->waking);
-}
-
-static inline int
-waking_non_zero(struct semaphore *sem)
-{
- long ret, tmp;
-
- /* An atomic conditional decrement. */
- __asm__ __volatile__(
- "1: ldl_l %1,%2\n"
- " blt %1,2f\n"
- " subl %1,1,%0\n"
- " stl_c %0,%2\n"
- " beq %0,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=r"(ret), "=r"(tmp), "=m"(sem->waking.counter)
- : "0"(0));
-
- return ret > 0;
-}
-
-
-/*
- * waking_non_zero_interruptible:
- * 1 got the lock
- * 0 go to sleep
- * -EINTR interrupted
- *
- * We must undo the sem->count down_interruptible decrement
- * simultaneously and atomicly with the sem->waking adjustment,
- * otherwise we can race with wake_one_more.
- *
- * This is accomplished by doing a 64-bit ll/sc on the 2 32-bit words.
- */
-
-static inline int
-waking_non_zero_interruptible(struct semaphore *sem, struct task_struct *tsk)
-{
- long ret, tmp, tmp2, tmp3;
-
- /* "Equivalent" C. Note that we have to do this all without
- (taken) branches in order to be a valid ll/sc sequence.
-
- do {
- tmp = ldq_l;
- ret = 0;
- if (tmp >= 0) {
- tmp += 0xffffffff00000000;
- ret = 1;
- }
- else if (pending) {
- // Since -1 + 1 carries into the high word, we have
- // to be more careful adding 1 here.
- tmp = (tmp & 0xffffffff00000000)
- | ((tmp + 1) & 0x00000000ffffffff;
- ret = -EINTR;
- }
- else {
- break; // ideally. we don't actually break
- // since this is a predicate we don't
- // have, and is more trouble to build
- // than to elide the noop stq_c.
- }
- tmp = stq_c = tmp;
- } while (tmp == 0);
- */
-
- __asm__ __volatile__(
- "1: ldq_l %1,%4\n"
- " lda %0,0\n"
- " cmovne %5,%6,%0\n"
- " addq %1,1,%2\n"
- " and %1,%7,%3\n"
- " andnot %2,%7,%2\n"
- " cmovge %1,1,%0\n"
- " or %3,%2,%2\n"
- " addq %1,%7,%3\n"
- " cmovne %5,%2,%1\n"
- " cmovge %2,%3,%1\n"
- " stq_c %1,%4\n"
- " beq %1,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3), "=m"(*sem)
- : "r"(signal_pending(tsk)), "r"(-EINTR),
- "r"(0xffffffff00000000));
-
- return ret;
-}
-
-/*
- * waking_non_zero_trylock is unused. we do everything in
- * down_trylock and let non-ll/sc hosts bounce around.
- */
-
-static inline int
-waking_non_zero_trylock(struct semaphore *sem)
-{
- return 0;
-}
-
-#endif
-#ifndef _ALPHA_SEMAPHORE_H
-#define _ALPHA_SEMAPHORE_H
-
-/*
- * SMP- and interrupt-safe semaphores..
- *
- * (C) Copyright 1996 Linus Torvalds
- * (C) Copyright 1996, 2000 Richard Henderson
- */
-
-#include <asm/current.h>
-#include <asm/system.h>
-#include <asm/atomic.h>
-
-struct semaphore {
- /* Careful, inline assembly knows about the position of these two. */
- atomic_t count;
- atomic_t waking; /* biased by -1 */
- wait_queue_head_t wait;
-#if WAITQUEUE_DEBUG
- long __magic;
-#endif
-};
-
-#if WAITQUEUE_DEBUG
-# define __SEM_DEBUG_INIT(name) , (long)&(name).__magic
-#else
-# define __SEM_DEBUG_INIT(name)
-#endif
-
-#define __SEMAPHORE_INITIALIZER(name,count) \
- { ATOMIC_INIT(count), ATOMIC_INIT(-1), \
- __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \
- __SEM_DEBUG_INIT(name) }
-
-#define __MUTEX_INITIALIZER(name) \
- __SEMAPHORE_INITIALIZER(name,1)
-
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
- struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
-
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
-#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
-
-extern inline void sema_init(struct semaphore *sem, int val)
-{
- /*
- * Logically,
- * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
- * except that gcc produces better initializing by parts yet.
- */
-
- atomic_set(&sem->count, val);
- atomic_set(&sem->waking, -1);
- init_waitqueue_head(&sem->wait);
-#if WAITQUEUE_DEBUG
- sem->__magic = (long)&sem->__magic;
-#endif
-}
-
-static inline void init_MUTEX (struct semaphore *sem)
-{
- sema_init(sem, 1);
-}
-
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
-{
- sema_init(sem, 0);
-}
-
-
-extern void __down(struct semaphore * sem);
-extern int __down_interruptible(struct semaphore * sem);
-extern int __down_trylock(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
-
-/* All have custom assembly linkages. */
-extern void __down_failed(struct semaphore * sem);
-extern void __down_failed_interruptible(struct semaphore * sem);
-extern void __down_failed_trylock(struct semaphore * sem);
-extern void __up_wakeup(struct semaphore * sem);
-
-/*
- * Whee. Hidden out of line code is fun. The contention cases are
- * handled out of line in kernel/sched.c; arch/alpha/lib/semaphore.S
- * takes care of making sure we can call it without clobbering regs.
- */
-
-extern inline void down(struct semaphore * sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __down_failed anyway, reuse them in
- the atomic operation as well.
-
- __down_failed takes the semaphore address in $24, and
- it's return address in $28. The pv is loaded as usual.
- The gp is clobbered (in the module case) as usual. */
-
- /* This little bit of silliness is to get the GP loaded for
- a function that ordinarily wouldn't. Otherwise we could
- have it done by the macro directly, which can be optimized
- the linker. */
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- pv = __down_failed;
- __asm__ __volatile__ (
- "/* semaphore down operation */\n"
- "1: ldl_l $24,%1\n"
- " subl $24,1,$28\n"
- " subl $24,1,$24\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " blt $24,3f\n"
- "4: mb\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%1\n"
- " jsr $28,($27),__down_failed\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv)
- : "$24", "$28", "memory");
-}
-
-extern inline int down_interruptible(struct semaphore * sem)
-{
- /* __down_failed_interruptible takes the semaphore address in $24,
- and it's return address in $28. The pv is loaded as usual.
- The gp is clobbered (in the module case) as usual. The return
- value is in $24. */
-
- register int ret __asm__("$24");
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- pv = __down_failed_interruptible;
- __asm__ __volatile__ (
- "/* semaphore down interruptible operation */\n"
- "1: ldl_l $24,%2\n"
- " subl $24,1,$28\n"
- " subl $24,1,$24\n"
- " stl_c $28,%2\n"
- " beq $28,2f\n"
- " blt $24,3f\n"
- " mov $31,%0\n"
- "4: mb\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%2\n"
- " jsr $28,($27),__down_failed_interruptible\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(ret), "=r"(pv)
- : "m"(sem->count), "r"(pv)
- : "$28", "memory");
-
- return ret;
-}
-
-/*
- * down_trylock returns 0 on success, 1 if we failed to get the lock.
- *
- * We must manipulate count and waking simultaneously and atomically.
- * Do this by using ll/sc on the pair of 32-bit words.
- */
-
-extern inline int down_trylock(struct semaphore * sem)
-{
- long ret, tmp, tmp2, sub;
-
- /* "Equivalent" C. Note that we have to do this all without
- (taken) branches in order to be a valid ll/sc sequence.
-
- do {
- tmp = ldq_l;
- sub = 0x0000000100000000;
- ret = ((int)tmp <= 0); // count =< 0 ?
- if ((int)tmp >= 0) sub = 0; // count >= 0 ?
- // note that if count=0 subq overflows to the high
- // longword (i.e waking)
- ret &= ((long)tmp < 0); // waking < 0 ?
- sub += 1;
- if (ret)
- break;
- tmp -= sub;
- tmp = stq_c = tmp;
- } while (tmp == 0);
- */
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- __asm__ __volatile__(
- "1: ldq_l %1,%4\n"
- " lda %3,1\n"
- " addl %1,0,%2\n"
- " sll %3,32,%3\n"
- " cmple %2,0,%0\n"
- " cmovge %2,0,%3\n"
- " cmplt %1,0,%2\n"
- " addq %3,1,%3\n"
- " and %0,%2,%0\n"
- " bne %0,2f\n"
- " subq %1,%3,%1\n"
- " stq_c %1,%4\n"
- " beq %1,3f\n"
- "2:\n"
- ".subsection 2\n"
- "3: br 1b\n"
- ".previous"
- : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(sub)
- : "m"(*sem)
- : "memory");
-
- return ret;
-}
-
-extern inline void up(struct semaphore * sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __up_wakeup anyway, reuse them in
- the atomic operation as well.
-
- __up_wakeup takes the semaphore address in $24, and
- it's return address in $28. The pv is loaded as usual.
- The gp is clobbered (in the module case) as usual. */
-
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- pv = __up_wakeup;
- __asm__ __volatile__ (
- "/* semaphore up operation */\n"
- " mb\n"
- "1: ldl_l $24,%1\n"
- " addl $24,1,$28\n"
- " addl $24,1,$24\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " ble $24,3f\n"
- "4:\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%1\n"
- " jsr $28,($27),__up_wakeup\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv)
- : "$24", "$28", "memory");
-}
-
-
-/* rw mutexes (should that be mutices? =) -- throw rw
- * spinlocks and semaphores together, and this is what we
- * end up with...
- *
- * The lock is initialized to BIAS. This way, a writer
- * subtracts BIAS ands gets 0 for the case of an uncontended
- * lock. Readers decrement by 1 and see a positive value
- * when uncontended, negative if there are writers waiting
- * (in which case it goes to sleep).
- *
- * The value 0x01000000 supports up to 128 processors and
- * lots of processes. BIAS must be chosen such that subtracting
- * BIAS once per CPU will result in the int remaining
- * negative.
- * In terms of fairness, this should result in the lock
- * flopping back and forth between readers and writers
- * under heavy use.
- *
- * -ben
- *
- * Once we start supporting machines with more than 128 CPUs,
- * we should go for using a 64bit atomic type instead of 32bit
- * as counter. We shall probably go for bias 0x80000000 then,
- * so that single sethi can set it.
- *
- * -jj
- */
-
-#define RW_LOCK_BIAS 0x01000000
-
-struct rw_semaphore {
- int count;
- /* bit 0 means read bias granted;
- bit 1 means write bias granted. */
- unsigned granted;
- wait_queue_head_t wait;
- wait_queue_head_t write_bias_wait;
-#if WAITQUEUE_DEBUG
- long __magic;
- atomic_t readers;
- atomic_t writers;
-#endif
-};
-
-#if WAITQUEUE_DEBUG
-#define __RWSEM_DEBUG_INIT , ATOMIC_INIT(0), ATOMIC_INIT(0)
-#else
-#define __RWSEM_DEBUG_INIT /* */
-#endif
-
-#define __RWSEM_INITIALIZER(name,count) \
- { (count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
- __WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait) \
- __SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
-
-#define __DECLARE_RWSEM_GENERIC(name,count) \
- struct rw_semaphore name = __RWSEM_INITIALIZER(name,count)
-
-#define DECLARE_RWSEM(name) \
- __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS)
-#define DECLARE_RWSEM_READ_LOCKED(name) \
- __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS-1)
-#define DECLARE_RWSEM_WRITE_LOCKED(name) \
- __DECLARE_RWSEM_GENERIC(name, 0)
-
-extern inline void init_rwsem(struct rw_semaphore *sem)
-{
- sem->count = RW_LOCK_BIAS;
- sem->granted = 0;
- init_waitqueue_head(&sem->wait);
- init_waitqueue_head(&sem->write_bias_wait);
-#if WAITQUEUE_DEBUG
- sem->__magic = (long)&sem->__magic;
- atomic_set(&sem->readers, 0);
- atomic_set(&sem->writers, 0);
-#endif
-}
-
-/* All have custom assembly linkages. */
-extern void __down_read_failed(struct rw_semaphore *sem);
-extern void __down_write_failed(struct rw_semaphore *sem);
-extern void __rwsem_wake(struct rw_semaphore *sem, unsigned long readers);
-
-extern inline void down_read(struct rw_semaphore *sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __down_read_failed anyway, reuse them in
- the atomic operation as well.
-
- __down_read_failed takes the semaphore address in $24, the count
- we read in $25, and it's return address in $28. The pv is loaded
- as usual. The gp is clobbered (in the module case) as usual. */
-
- /* This little bit of silliness is to get the GP loaded for
- a function that ordinarily wouldn't. Otherwise we could
- have it done by the macro directly, which can be optimized
- the linker. */
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- pv = __down_read_failed;
- __asm__ __volatile__(
- "/* semaphore down_read operation */\n"
- "1: ldl_l $24,%1\n"
- " subl $24,1,$28\n"
- " subl $24,1,$25\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " blt $25,3f\n"
- "4: mb\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%1\n"
- " jsr $28,($27),__down_read_failed\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv)
- : "$24", "$25", "$28", "memory");
-
-#if WAITQUEUE_DEBUG
- if (sem->granted & 2)
- BUG();
- if (atomic_read(&sem->writers))
- BUG();
- atomic_inc(&sem->readers);
-#endif
-}
-
-extern inline void down_write(struct rw_semaphore *sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __down_write_failed anyway, reuse them in
- the atomic operation as well.
-
- __down_write_failed takes the semaphore address in $24, the count
- we read in $25, and it's return address in $28. The pv is loaded
- as usual. The gp is clobbered (in the module case) as usual. */
-
- /* This little bit of silliness is to get the GP loaded for
- a function that ordinarily wouldn't. Otherwise we could
- have it done by the macro directly, which can be optimized
- the linker. */
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
-#endif
-
- pv = __down_write_failed;
- __asm__ __volatile__(
- "/* semaphore down_write operation */\n"
- "1: ldl_l $24,%1\n"
- " ldah $28,%3($24)\n"
- " ldah $25,%3($24)\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " bne $25,3f\n"
- "4: mb\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%1\n"
- " jsr $28,($27),__down_write_failed\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv), "i"(-(RW_LOCK_BIAS >> 16))
- : "$24", "$25", "$28", "memory");
-
-#if WAITQUEUE_DEBUG
- if (atomic_read(&sem->writers))
- BUG();
- if (atomic_read(&sem->readers))
- BUG();
- if (sem->granted & 3)
- BUG();
- atomic_inc(&sem->writers);
-#endif
-}
-
-/* When a reader does a release, the only significant case is when
- there was a writer waiting, and we've * bumped the count to 0: we must
-wake the writer up. */
-
-extern inline void up_read(struct rw_semaphore *sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __rwsem_wake anyway, reuse them in
- the atomic operation as well.
-
- __rwsem_wake takes the semaphore address in $24, the
- number of waiting readers in $25, and it's return address
- in $28. The pv is loaded as usual. The gp is clobbered
- (in the module case) as usual. */
-
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
- if (sem->granted & 2)
- BUG();
- if (atomic_read(&sem->writers))
- BUG();
- atomic_dec(&sem->readers);
-#endif
-
- pv = __rwsem_wake;
- __asm__ __volatile__(
- "/* semaphore up_read operation */\n"
- " mb\n"
- "1: ldl_l $24,%1\n"
- " addl $24,1,$28\n"
- " addl $24,1,$24\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " beq $24,3f\n"
- "4:\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: lda $24,%1\n"
- " mov 0,$25\n"
- " jsr $28,($27),__rwsem_wake\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv)
- : "$24", "$25", "$28", "memory");
-}
-
-/* releasing the writer is easy -- just release it and
- * wake up any sleepers.
- */
-extern inline void up_write(struct rw_semaphore *sem)
-{
- /* Given that we have to use particular hard registers to
- communicate with __rwsem_wake anyway, reuse them in
- the atomic operation as well.
-
- __rwsem_wake takes the semaphore address in $24, the
- number of waiting readers in $25, and it's return address
- in $28. The pv is loaded as usual. The gp is clobbered
- (in the module case) as usual. */
-
- register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
- CHECK_MAGIC(sem->__magic);
- if (sem->granted & 3)
- BUG();
- if (atomic_read(&sem->readers))
- BUG();
- if (atomic_read(&sem->writers) != 1)
- BUG();
- atomic_dec(&sem->writers);
-#endif
-
- pv = __rwsem_wake;
- __asm__ __volatile__(
- "/* semaphore up_write operation */\n"
- " mb\n"
- "1: ldl_l $24,%1\n"
- " ldah $28,%3($24)\n"
- " stl_c $28,%1\n"
- " beq $28,2f\n"
- " blt $24,3f\n"
- "4:\n"
- ".subsection 2\n"
- "2: br 1b\n"
- "3: ldah $25,%3($24)\n"
- /* Only do the wake if we're no longer negative. */
- " blt $25,4b\n"
- " lda $24,%1\n"
- " jsr $28,($27),__rwsem_wake\n"
- " ldgp $29,0($28)\n"
- " br 4b\n"
- ".previous"
- : "=r"(pv)
- : "m"(sem->count), "r"(pv), "i"(RW_LOCK_BIAS >> 16)
- : "$24", "$25", "$28", "memory");
-}
-
-#endif
+#ifndef _ALPHA_SEMAPHORE_H
+#define _ALPHA_SEMAPHORE_H
+
+/*
+ * SMP- and interrupt-safe semaphores..
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ * (C) Copyright 1996, 2000 Richard Henderson
+ */
+
+#include <asm/current.h>
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+#define DEBUG_SEMAPHORE 0
+#define DEBUG_RW_SEMAPHORE 0
+
+struct semaphore {
+ /* Careful, inline assembly knows about the position of these two. */
+ atomic_t count __attribute__((aligned(8)));
+ atomic_t waking; /* biased by -1 */
+
+ wait_queue_head_t wait;
+#if WAITQUEUE_DEBUG
+ long __magic;
+#endif
+};
+
+#if WAITQUEUE_DEBUG
+# define __SEM_DEBUG_INIT(name) , (long)&(name).__magic
+#else
+# define __SEM_DEBUG_INIT(name)
+#endif
+
+#define __SEMAPHORE_INITIALIZER(name,count) \
+ { ATOMIC_INIT(count), ATOMIC_INIT(-1), \
+ __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \
+ __SEM_DEBUG_INIT(name) }
+
+#define __MUTEX_INITIALIZER(name) \
+ __SEMAPHORE_INITIALIZER(name,1)
+
+#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
+ struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+
+#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
+#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
+
+static inline void sema_init(struct semaphore *sem, int val)
+{
+ /*
+ * Logically,
+ * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+	 * except that gcc still produces better code when initializing
+	 * the members individually.
+ */
+
+ atomic_set(&sem->count, val);
+ atomic_set(&sem->waking, -1);
+ init_waitqueue_head(&sem->wait);
+#if WAITQUEUE_DEBUG
+ sem->__magic = (long)&sem->__magic;
+#endif
+}
+
+static inline void init_MUTEX (struct semaphore *sem)
+{
+ sema_init(sem, 1);
+}
+
+static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+{
+ sema_init(sem, 0);
+}
+
+extern void down(struct semaphore *);
+extern void __down_failed(struct semaphore *);
+extern int down_interruptible(struct semaphore *);
+extern int __down_failed_interruptible(struct semaphore *);
+extern int down_trylock(struct semaphore *);
+extern void up(struct semaphore *);
+extern void __up_wakeup(struct semaphore *);
+
+/*
+ * Hidden out of line code is fun, but extremely messy. Rely on newer
+ * compilers to do a respectable job with this. The contention cases
+ * are handled out of line in arch/alpha/kernel/semaphore.c.
+ */
+
+static inline void __down(struct semaphore *sem)
+{
+ long count = atomic_dec_return(&sem->count);
+ if (__builtin_expect(count < 0, 0))
+ __down_failed(sem);
+}
+
+static inline int __down_interruptible(struct semaphore *sem)
+{
+ long count = atomic_dec_return(&sem->count);
+ if (__builtin_expect(count < 0, 0))
+ return __down_failed_interruptible(sem);
+ return 0;
+}
+
+/*
+ * down_trylock returns 0 on success, 1 if we failed to get the lock.
+ *
+ * We must manipulate count and waking simultaneously and atomically.
+ * Do this by using ll/sc on the pair of 32-bit words.
+ */
+
+static inline int __down_trylock(struct semaphore * sem)
+{
+ long ret, tmp, tmp2, sub;
+
+ /* "Equivalent" C. Note that we have to do this all without
+ (taken) branches in order to be a valid ll/sc sequence.
+
+ do {
+ tmp = ldq_l;
+ sub = 0x0000000100000000;
+ ret = ((int)tmp <= 0); // count <= 0 ?
+ // Note that if count=0, the decrement overflows into
+ // waking, so cancel the 1 loaded above. Also cancel
+ // it if the lock was already free.
+ if ((int)tmp >= 0) sub = 0; // count >= 0 ?
+ ret &= ((long)tmp < 0); // waking < 0 ?
+ sub += 1;
+ if (ret) break;
+ tmp -= sub;
+ tmp = stq_c = tmp;
+ } while (tmp == 0);
+ */
+
+ __asm__ __volatile__(
+ "1: ldq_l %1,%4\n"
+ " lda %3,1\n"
+ " addl %1,0,%2\n"
+ " sll %3,32,%3\n"
+ " cmple %2,0,%0\n"
+ " cmovge %2,0,%3\n"
+ " cmplt %1,0,%2\n"
+ " addq %3,1,%3\n"
+ " and %0,%2,%0\n"
+ " bne %0,2f\n"
+ " subq %1,%3,%1\n"
+ " stq_c %1,%4\n"
+ " beq %1,3f\n"
+ "2: mb\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(sub)
+ : "m"(*sem)
+ : "memory");
+
+ return ret;
+}
+
+static inline void __up(struct semaphore *sem)
+{
+ long ret, tmp, tmp2, tmp3;
+
+ /* We must manipulate count and waking simultaneously and atomically.
+ Otherwise we have races between up and __down_failed_interruptible
+ waking up on a signal.
+
+ "Equivalent" C. Note that we have to do this all without
+ (taken) branches in order to be a valid ll/sc sequence.
+
+ do {
+ tmp = ldq_l;
+ ret = (int)tmp + 1; // count += 1;
+ tmp2 = tmp & 0xffffffff00000000; // extract waking
+ if (ret <= 0) // still sleepers?
+ tmp2 += 0x0000000100000000; // waking += 1;
+ tmp = ret & 0x00000000ffffffff; // insert count
+ tmp |= tmp2; // insert waking;
+ tmp = stq_c = tmp;
+ } while (tmp == 0);
+ */
+
+ __asm__ __volatile__(
+ " mb\n"
+ "1: ldq_l %1,%4\n"
+ " addl %1,1,%0\n"
+ " zapnot %1,0xf0,%2\n"
+ " addq %2,%5,%3\n"
+ " cmovle %0,%3,%2\n"
+ " zapnot %0,0x0f,%1\n"
+ " bis %1,%2,%1\n"
+ " stq_c %1,%4\n"
+ " beq %1,3f\n"
+ "2:\n"
+ ".subsection 2\n"
+ "3: br 1b\n"
+ ".previous"
+ : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
+ : "m"(*sem), "r"(0x0000000100000000)
+ : "memory");
+
+ if (__builtin_expect(ret <= 0, 0))
+ __up_wakeup(sem);
+}
+
+#if !WAITQUEUE_DEBUG && !DEBUG_SEMAPHORE
+extern inline void down(struct semaphore *sem)
+{
+ __down(sem);
+}
+extern inline int down_interruptible(struct semaphore *sem)
+{
+ return __down_interruptible(sem);
+}
+extern inline int down_trylock(struct semaphore *sem)
+{
+ return __down_trylock(sem);
+}
+extern inline void up(struct semaphore *sem)
+{
+ __up(sem);
+}
+#endif
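
For reference, the interface exported here is used exactly as on other architectures; a minimal hedged usage sketch (foo_lock, foo_count and foo_increment are invented names):

static DECLARE_MUTEX(foo_lock);
static int foo_count;

static int foo_increment(void)
{
	/* Sleep until the mutex is ours, but give up on a signal. */
	if (down_interruptible(&foo_lock))
		return -ERESTARTSYS;
	foo_count++;
	up(&foo_lock);
	return 0;
}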
+
+/* rw mutexes (should that be mutices? =) -- throw rw
+ * spinlocks and semaphores together, and this is what we
+ * end up with...
+ *
+ * The lock is initialized to BIAS. This way, a writer
+ * subtracts BIAS and gets 0 for the case of an uncontended
+ * lock. Readers decrement by 1 and see a positive value
+ * when uncontended, negative if there are writers waiting
+ * (in which case it goes to sleep).
+ *
+ * The value 0x01000000 supports up to 128 processors and
+ * lots of processes. BIAS must be chosen such that subtracting
+ * BIAS once per CPU will result in the int remaining
+ * negative.
+ * In terms of fairness, this should result in the lock
+ * flopping back and forth between readers and writers
+ * under heavy use.
+ *
+ * -ben
+ *
+ * Once we start supporting machines with more than 128 CPUs,
+ * we should go for using a 64bit atomic type instead of 32bit
+ * as counter. We shall probably go for bias 0x80000000 then,
+ * so that single sethi can set it.
+ *
+ * -jj
+ */
+
+#define RW_LOCK_BIAS 0x01000000
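
To make the bias arithmetic concrete, here is a hedged, user-space walk-through of the counter values (illustrative only; it simply mirrors the rules described in the comment above):

#include <assert.h>
#define RW_LOCK_BIAS 0x01000000

int main(void)
{
	int count = RW_LOCK_BIAS;	/* unlocked                      */

	count -= 1;			/* down_read: positive,          */
	assert(count > 0);		/*   so the reader proceeds      */
	count += 1;			/* up_read                       */

	count -= RW_LOCK_BIAS;		/* down_write: exactly zero,     */
	assert(count == 0);		/*   so the writer proceeds      */

	count -= 1;			/* down_read while written:      */
	assert(count < 0);		/*   negative, the reader sleeps */
	return 0;
}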
+
+struct rw_semaphore {
+ atomic_t count;
+ /* bit 0 means read bias granted;
+ bit 1 means write bias granted. */
+ unsigned granted;
+ wait_queue_head_t wait;
+ wait_queue_head_t write_bias_wait;
+#if WAITQUEUE_DEBUG
+ long __magic;
+ atomic_t readers;
+ atomic_t writers;
+#endif
+};
+
+#if WAITQUEUE_DEBUG
+#define __RWSEM_DEBUG_INIT , ATOMIC_INIT(0), ATOMIC_INIT(0)
+#else
+#define __RWSEM_DEBUG_INIT /* */
+#endif
+
+#define __RWSEM_INITIALIZER(name,count) \
+ { ATOMIC_INIT(count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
+ __WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait) \
+ __SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
+
+#define __DECLARE_RWSEM_GENERIC(name,count) \
+ struct rw_semaphore name = __RWSEM_INITIALIZER(name,count)
+
+#define DECLARE_RWSEM(name) \
+ __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS)
+#define DECLARE_RWSEM_READ_LOCKED(name) \
+ __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS-1)
+#define DECLARE_RWSEM_WRITE_LOCKED(name) \
+ __DECLARE_RWSEM_GENERIC(name, 0)
+
+static inline void init_rwsem(struct rw_semaphore *sem)
+{
+ atomic_set (&sem->count, RW_LOCK_BIAS);
+ sem->granted = 0;
+ init_waitqueue_head(&sem->wait);
+ init_waitqueue_head(&sem->write_bias_wait);
+#if WAITQUEUE_DEBUG
+ sem->__magic = (long)&sem->__magic;
+ atomic_set(&sem->readers, 0);
+ atomic_set(&sem->writers, 0);
+#endif
+}
+
+extern void down_read(struct rw_semaphore *);
+extern void down_write(struct rw_semaphore *);
+extern void up_read(struct rw_semaphore *);
+extern void up_write(struct rw_semaphore *);
+extern void __down_read_failed(struct rw_semaphore *, int);
+extern void __down_write_failed(struct rw_semaphore *, int);
+extern void __rwsem_wake(struct rw_semaphore *, int);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ long count = atomic_dec_return(&sem->count);
+ if (__builtin_expect(count < 0, 0))
+ __down_read_failed(sem, count);
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ long count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
+ if (__builtin_expect(count != 0, 0))
+ __down_write_failed(sem, count);
+}
+
+/* When a reader does a release, the only significant case is when there
+ was a writer waiting and we've bumped the count to 0; in that case we
+ must wake the writer up. */
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long count;
+ mb();
+ count = atomic_inc_return(&sem->count);
+ if (__builtin_expect(count == 0, 0))
+ __rwsem_wake(sem, 0);
+}
+
+/* Releasing the writer is easy -- just release it and wake up
+ any sleepers. */
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long count, wake;
+ mb();
+ count = atomic_add_return(RW_LOCK_BIAS, &sem->count);
+
+ /* Only do the wake if we were, but are no longer, negative. */
+ wake = ((int)(count - RW_LOCK_BIAS) < 0) && count >= 0;
+ if (__builtin_expect(wake, 0))
+ __rwsem_wake(sem, count);
+}
+
+#if !WAITQUEUE_DEBUG && !DEBUG_RW_SEMAPHORE
+extern inline void down_read(struct rw_semaphore *sem)
+{
+ __down_read(sem);
+}
+extern inline void down_write(struct rw_semaphore *sem)
+{
+ __down_write(sem);
+}
+extern inline void up_read(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+extern inline void up_write(struct rw_semaphore *sem)
+{
+ __up_write(sem);
+}
+#endif
+
+#endif
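For reference, a user of the interface declared above looks roughly like the following sketch; my_sem, my_data and the two helper functions are placeholder names, and the usual kernel headers are assumed to be included:

	static DECLARE_RWSEM(my_sem);
	static int my_data;

	int read_my_data(void)
	{
		int v;
		down_read(&my_sem);	/* any number of readers at once */
		v = my_data;
		up_read(&my_sem);
		return v;
	}

	void write_my_data(int v)
	{
		down_write(&my_sem);	/* exclusive against readers and writers */
		my_data = v;
		up_write(&my_sem);
	}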
@@ -80,7+80,7 @@ static inline void spin_lock(spinlock_t * lock) " blbs %0,2b\n"
" br 1b\n"
".previous"
- : "=r" (tmp), "=m" (lock->lock)
+ : "=&r" (tmp), "=m" (lock->lock)
: "m"(lock->lock) : "memory");
}
--- /dev/null
+/*
+ * include/asm-alpha/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for alpha EV5 and EV6
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *);
+extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *, unsigned long *);
+
+extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
+ unsigned long *);
+extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
+ unsigned long *, unsigned long *,
+ unsigned long *);
+extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
+ unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+
+asm("
+ .text
+ .align 3
+ .ent xor_alpha_2
+xor_alpha_2:
+ .prologue 0
+ srl $16, 6, $16
+ .align 4
+2:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,8($17)
+ ldq $3,8($18)
+
+ ldq $4,16($17)
+ ldq $5,16($18)
+ ldq $6,24($17)
+ ldq $7,24($18)
+
+ ldq $19,32($17)
+ ldq $20,32($18)
+ ldq $21,40($17)
+ ldq $22,40($18)
+
+ ldq $23,48($17)
+ ldq $24,48($18)
+ ldq $25,56($17)
+ xor $0,$1,$0 # 7 cycles from $1 load
+
+ ldq $27,56($18)
+ xor $2,$3,$2
+ stq $0,0($17)
+ xor $4,$5,$4
+
+ stq $2,8($17)
+ xor $6,$7,$6
+ stq $4,16($17)
+ xor $19,$20,$19
+
+ stq $6,24($17)
+ xor $21,$22,$21
+ stq $19,32($17)
+ xor $23,$24,$23
+
+ stq $21,40($17)
+ xor $25,$27,$25
+ stq $23,48($17)
+ subq $16,1,$16
+
+ stq $25,56($17)
+ addq $17,64,$17
+ addq $18,64,$18
+ bgt $16,2b
+
+ ret
+ .end xor_alpha_2
+
+ .align 3
+ .ent xor_alpha_3
+xor_alpha_3:
+ .prologue 0
+ srl $16, 6, $16
+ .align 4
+3:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,8($17)
+
+ ldq $4,8($18)
+ ldq $6,16($17)
+ ldq $7,16($18)
+ ldq $21,24($17)
+
+ ldq $22,24($18)
+ ldq $24,32($17)
+ ldq $25,32($18)
+ ldq $5,8($19)
+
+ ldq $20,16($19)
+ ldq $23,24($19)
+ ldq $27,32($19)
+ nop
+
+ xor $0,$1,$1 # 8 cycles from $0 load
+ xor $3,$4,$4 # 6 cycles from $4 load
+ xor $6,$7,$7 # 6 cycles from $7 load
+ xor $21,$22,$22 # 5 cycles from $22 load
+
+ xor $1,$2,$2 # 9 cycles from $2 load
+ xor $24,$25,$25 # 5 cycles from $25 load
+ stq $2,0($17)
+ xor $4,$5,$5 # 6 cycles from $5 load
+
+ stq $5,8($17)
+ xor $7,$20,$20 # 7 cycles from $20 load
+ stq $20,16($17)
+ xor $22,$23,$23 # 7 cycles from $23 load
+
+ stq $23,24($17)
+ xor $25,$27,$27 # 7 cycles from $27 load
+ stq $27,32($17)
+ nop
+
+ ldq $0,40($17)
+ ldq $1,40($18)
+ ldq $3,48($17)
+ ldq $4,48($18)
+
+ ldq $6,56($17)
+ ldq $7,56($18)
+ ldq $2,40($19)
+ ldq $5,48($19)
+
+ ldq $20,56($19)
+ xor $0,$1,$1 # 4 cycles from $1 load
+ xor $3,$4,$4 # 5 cycles from $4 load
+ xor $6,$7,$7 # 5 cycles from $7 load
+
+ xor $1,$2,$2 # 4 cycles from $2 load
+ xor $4,$5,$5 # 5 cycles from $5 load
+ stq $2,40($17)
+ xor $7,$20,$20 # 4 cycles from $20 load
+
+ stq $5,48($17)
+ subq $16,1,$16
+ stq $20,56($17)
+ addq $19,64,$19
+
+ addq $18,64,$18
+ addq $17,64,$17
+ bgt $16,3b
+ ret
+ .end xor_alpha_3
+
+ .align 3
+ .ent xor_alpha_4
+xor_alpha_4:
+ .prologue 0
+ srl $16, 6, $16
+ .align 4
+4:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,0($20)
+
+ ldq $4,8($17)
+ ldq $5,8($18)
+ ldq $6,8($19)
+ ldq $7,8($20)
+
+ ldq $21,16($17)
+ ldq $22,16($18)
+ ldq $23,16($19)
+ ldq $24,16($20)
+
+ ldq $25,24($17)
+ xor $0,$1,$1 # 6 cycles from $1 load
+ ldq $27,24($18)
+ xor $2,$3,$3 # 6 cycles from $3 load
+
+ ldq $0,24($19)
+ xor $1,$3,$3
+ ldq $1,24($20)
+ xor $4,$5,$5 # 7 cycles from $5 load
+
+ stq $3,0($17)
+ xor $6,$7,$7
+ xor $21,$22,$22 # 7 cycles from $22 load
+ xor $5,$7,$7
+
+ stq $7,8($17)
+ xor $23,$24,$24 # 7 cycles from $24 load
+ ldq $2,32($17)
+ xor $22,$24,$24
+
+ ldq $3,32($18)
+ ldq $4,32($19)
+ ldq $5,32($20)
+ xor $25,$27,$27 # 8 cycles from $27 load
+
+ ldq $6,40($17)
+ ldq $7,40($18)
+ ldq $21,40($19)
+ ldq $22,40($20)
+
+ stq $24,16($17)
+ xor $0,$1,$1 # 9 cycles from $1 load
+ xor $2,$3,$3 # 5 cycles from $3 load
+ xor $27,$1,$1
+
+ stq $1,24($17)
+ xor $4,$5,$5 # 5 cycles from $5 load
+ ldq $23,48($17)
+ ldq $24,48($18)
+
+ ldq $25,48($19)
+ xor $3,$5,$5
+ ldq $27,48($20)
+ ldq $0,56($17)
+
+ ldq $1,56($18)
+ ldq $2,56($19)
+ xor $6,$7,$7 # 8 cycles from $6 load
+ ldq $3,56($20)
+
+ stq $5,32($17)
+ xor $21,$22,$22 # 8 cycles from $22 load
+ xor $7,$22,$22
+ xor $23,$24,$24 # 5 cycles from $24 load
+
+ stq $22,40($17)
+ xor $25,$27,$27 # 5 cycles from $27 load
+ xor $24,$27,$27
+ xor $0,$1,$1 # 5 cycles from $1 load
+
+ stq $27,48($17)
+ xor $2,$3,$3 # 4 cycles from $3 load
+ xor $1,$3,$3
+ subq $16,1,$16
+
+ stq $3,56($17)
+ addq $20,64,$20
+ addq $19,64,$19
+ addq $18,64,$18
+
+ addq $17,64,$17
+ bgt $16,4b
+ ret
+ .end xor_alpha_4
+
+ .align 3
+ .ent xor_alpha_5
+xor_alpha_5:
+ .prologue 0
+ srl $16, 6, $16
+ .align 4
+5:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,0($20)
+
+ ldq $4,0($21)
+ ldq $5,8($17)
+ ldq $6,8($18)
+ ldq $7,8($19)
+
+ ldq $22,8($20)
+ ldq $23,8($21)
+ ldq $24,16($17)
+ ldq $25,16($18)
+
+ ldq $27,16($19)
+ xor $0,$1,$1 # 6 cycles from $1 load
+ ldq $28,16($20)
+ xor $2,$3,$3 # 6 cycles from $3 load
+
+ ldq $0,16($21)
+ xor $1,$3,$3
+ ldq $1,24($17)
+ xor $3,$4,$4 # 7 cycles from $4 load
+
+ stq $4,0($17)
+ xor $5,$6,$6 # 7 cycles from $6 load
+ xor $7,$22,$22 # 7 cycles from $22 load
+ xor $6,$23,$23 # 7 cycles from $23 load
+
+ ldq $2,24($18)
+ xor $22,$23,$23
+ ldq $3,24($19)
+ xor $24,$25,$25 # 8 cycles from $25 load
+
+ stq $23,8($17)
+ xor $25,$27,$27 # 8 cycles from $27 load
+ ldq $4,24($20)
+ xor $28,$0,$0 # 7 cycles from $0 load
+
+ ldq $5,24($21)
+ xor $27,$0,$0
+ ldq $6,32($17)
+ ldq $7,32($18)
+
+ stq $0,16($17)
+ xor $1,$2,$2 # 6 cycles from $2 load
+ ldq $22,32($19)
+ xor $3,$4,$4 # 4 cycles from $4 load
+
+ ldq $23,32($20)
+ xor $2,$4,$4
+ ldq $24,32($21)
+ ldq $25,40($17)
+
+ ldq $27,40($18)
+ ldq $28,40($19)
+ ldq $0,40($20)
+ xor $4,$5,$5 # 7 cycles from $5 load
+
+ stq $5,24($17)
+ xor $6,$7,$7 # 7 cycles from $7 load
+ ldq $1,40($21)
+ ldq $2,48($17)
+
+ ldq $3,48($18)
+ xor $7,$22,$22 # 7 cycles from $22 load
+ ldq $4,48($19)
+ xor $23,$24,$24 # 6 cycles from $24 load
+
+ ldq $5,48($20)
+ xor $22,$24,$24
+ ldq $6,48($21)
+ xor $25,$27,$27 # 7 cycles from $27 load
+
+ stq $24,32($17)
+ xor $27,$28,$28 # 8 cycles from $28 load
+ ldq $7,56($17)
+ xor $0,$1,$1 # 6 cycles from $1 load
+
+ ldq $22,56($18)
+ ldq $23,56($19)
+ ldq $24,56($20)
+ ldq $25,56($21)
+
+ xor $28,$1,$1
+ xor $2,$3,$3 # 9 cycles from $3 load
+ xor $3,$4,$4 # 9 cycles from $4 load
+ xor $5,$6,$6 # 8 cycles from $6 load
+
+ stq $1,40($17)
+ xor $4,$6,$6
+ xor $7,$22,$22 # 7 cycles from $22 load
+ xor $23,$24,$24 # 6 cycles from $24 load
+
+ stq $6,48($17)
+ xor $22,$24,$24
+ subq $16,1,$16
+ xor $24,$25,$25 # 8 cycles from $25 load
+
+ stq $25,56($17)
+ addq $21,64,$21
+ addq $20,64,$20
+ addq $19,64,$19
+
+ addq $18,64,$18
+ addq $17,64,$17
+ bgt $16,5b
+ ret
+ .end xor_alpha_5
+
+ .align 3
+ .ent xor_alpha_prefetch_2
+xor_alpha_prefetch_2:
+ .prologue 0
+ srl $16, 6, $16
+
+ ldq $31, 0($17)
+ ldq $31, 0($18)
+
+ ldq $31, 64($17)
+ ldq $31, 64($18)
+
+ ldq $31, 128($17)
+ ldq $31, 128($18)
+
+ ldq $31, 192($17)
+ ldq $31, 192($18)
+ .align 4
+2:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,8($17)
+ ldq $3,8($18)
+
+ ldq $4,16($17)
+ ldq $5,16($18)
+ ldq $6,24($17)
+ ldq $7,24($18)
+
+ ldq $19,32($17)
+ ldq $20,32($18)
+ ldq $21,40($17)
+ ldq $22,40($18)
+
+ ldq $23,48($17)
+ ldq $24,48($18)
+ ldq $25,56($17)
+ ldq $27,56($18)
+
+ ldq $31,256($17)
+ xor $0,$1,$0 # 8 cycles from $1 load
+ ldq $31,256($18)
+ xor $2,$3,$2
+
+ stq $0,0($17)
+ xor $4,$5,$4
+ stq $2,8($17)
+ xor $6,$7,$6
+
+ stq $4,16($17)
+ xor $19,$20,$19
+ stq $6,24($17)
+ xor $21,$22,$21
+
+ stq $19,32($17)
+ xor $23,$24,$23
+ stq $21,40($17)
+ xor $25,$27,$25
+
+ stq $23,48($17)
+ subq $16,1,$16
+ stq $25,56($17)
+ addq $17,64,$17
+
+ addq $18,64,$18
+ bgt $16,2b
+ ret
+ .end xor_alpha_prefetch_2
+
+ .align 3
+ .ent xor_alpha_prefetch_3
+xor_alpha_prefetch_3:
+ .prologue 0
+ srl $16, 6, $16
+
+ ldq $31, 0($17)
+ ldq $31, 0($18)
+ ldq $31, 0($19)
+
+ ldq $31, 64($17)
+ ldq $31, 64($18)
+ ldq $31, 64($19)
+
+ ldq $31, 128($17)
+ ldq $31, 128($18)
+ ldq $31, 128($19)
+
+ ldq $31, 192($17)
+ ldq $31, 192($18)
+ ldq $31, 192($19)
+ .align 4
+3:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,8($17)
+
+ ldq $4,8($18)
+ ldq $6,16($17)
+ ldq $7,16($18)
+ ldq $21,24($17)
+
+ ldq $22,24($18)
+ ldq $24,32($17)
+ ldq $25,32($18)
+ ldq $5,8($19)
+
+ ldq $20,16($19)
+ ldq $23,24($19)
+ ldq $27,32($19)
+ nop
+
+ xor $0,$1,$1 # 8 cycles from $0 load
+ xor $3,$4,$4 # 7 cycles from $4 load
+ xor $6,$7,$7 # 6 cycles from $7 load
+ xor $21,$22,$22 # 5 cycles from $22 load
+
+ xor $1,$2,$2 # 9 cycles from $2 load
+ xor $24,$25,$25 # 5 cycles from $25 load
+ stq $2,0($17)
+ xor $4,$5,$5 # 6 cycles from $5 load
+
+ stq $5,8($17)
+ xor $7,$20,$20 # 7 cycles from $20 load
+ stq $20,16($17)
+ xor $22,$23,$23 # 7 cycles from $23 load
+
+ stq $23,24($17)
+ xor $25,$27,$27 # 7 cycles from $27 load
+ stq $27,32($17)
+ nop
+
+ ldq $0,40($17)
+ ldq $1,40($18)
+ ldq $3,48($17)
+ ldq $4,48($18)
+
+ ldq $6,56($17)
+ ldq $7,56($18)
+ ldq $2,40($19)
+ ldq $5,48($19)
+
+ ldq $20,56($19)
+ ldq $31,256($17)
+ ldq $31,256($18)
+ ldq $31,256($19)
+
+ xor $0,$1,$1 # 6 cycles from $1 load
+ xor $3,$4,$4 # 5 cycles from $4 load
+ xor $6,$7,$7 # 5 cycles from $7 load
+ xor $1,$2,$2 # 4 cycles from $2 load
+
+ xor $4,$5,$5 # 5 cycles from $5 load
+ xor $7,$20,$20 # 4 cycles from $20 load
+ stq $2,40($17)
+ subq $16,1,$16
+
+ stq $5,48($17)
+ addq $19,64,$19
+ stq $20,56($17)
+ addq $18,64,$18
+
+ addq $17,64,$17
+ bgt $16,3b
+ ret
+ .end xor_alpha_prefetch_3
+
+ .align 3
+ .ent xor_alpha_prefetch_4
+xor_alpha_prefetch_4:
+ .prologue 0
+ srl $16, 6, $16
+
+ ldq $31, 0($17)
+ ldq $31, 0($18)
+ ldq $31, 0($19)
+ ldq $31, 0($20)
+
+ ldq $31, 64($17)
+ ldq $31, 64($18)
+ ldq $31, 64($19)
+ ldq $31, 64($20)
+
+ ldq $31, 128($17)
+ ldq $31, 128($18)
+ ldq $31, 128($19)
+ ldq $31, 128($20)
+
+ ldq $31, 192($17)
+ ldq $31, 192($18)
+ ldq $31, 192($19)
+ ldq $31, 192($20)
+ .align 4
+4:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,0($20)
+
+ ldq $4,8($17)
+ ldq $5,8($18)
+ ldq $6,8($19)
+ ldq $7,8($20)
+
+ ldq $21,16($17)
+ ldq $22,16($18)
+ ldq $23,16($19)
+ ldq $24,16($20)
+
+ ldq $25,24($17)
+ xor $0,$1,$1 # 6 cycles from $1 load
+ ldq $27,24($18)
+ xor $2,$3,$3 # 6 cycles from $3 load
+
+ ldq $0,24($19)
+ xor $1,$3,$3
+ ldq $1,24($20)
+ xor $4,$5,$5 # 7 cycles from $5 load
+
+ stq $3,0($17)
+ xor $6,$7,$7
+ xor $21,$22,$22 # 7 cycles from $22 load
+ xor $5,$7,$7
+
+ stq $7,8($17)
+ xor $23,$24,$24 # 7 cycles from $24 load
+ ldq $2,32($17)
+ xor $22,$24,$24
+
+ ldq $3,32($18)
+ ldq $4,32($19)
+ ldq $5,32($20)
+ xor $25,$27,$27 # 8 cycles from $27 load
+
+ ldq $6,40($17)
+ ldq $7,40($18)
+ ldq $21,40($19)
+ ldq $22,40($20)
+
+ stq $24,16($17)
+ xor $0,$1,$1 # 9 cycles from $1 load
+ xor $2,$3,$3 # 5 cycles from $3 load
+ xor $27,$1,$1
+
+ stq $1,24($17)
+ xor $4,$5,$5 # 5 cycles from $5 load
+ ldq $23,48($17)
+ xor $3,$5,$5
+
+ ldq $24,48($18)
+ ldq $25,48($19)
+ ldq $27,48($20)
+ ldq $0,56($17)
+
+ ldq $1,56($18)
+ ldq $2,56($19)
+ ldq $3,56($20)
+ xor $6,$7,$7 # 8 cycles from $6 load
+
+ ldq $31,256($17)
+ xor $21,$22,$22 # 8 cycles from $22 load
+ ldq $31,256($18)
+ xor $7,$22,$22
+
+ ldq $31,256($19)
+ xor $23,$24,$24 # 6 cycles from $24 load
+ ldq $31,256($20)
+ xor $25,$27,$27 # 6 cycles from $27 load
+
+ stq $5,32($17)
+ xor $24,$27,$27
+ xor $0,$1,$1 # 7 cycles from $1 load
+ xor $2,$3,$3 # 6 cycles from $3 load
+
+ stq $22,40($17)
+ xor $1,$3,$3
+ stq $27,48($17)
+ subq $16,1,$16
+
+ stq $3,56($17)
+ addq $20,64,$20
+ addq $19,64,$19
+ addq $18,64,$18
+
+ addq $17,64,$17
+ bgt $16,4b
+ ret
+ .end xor_alpha_prefetch_4
+
+ .align 3
+ .ent xor_alpha_prefetch_5
+xor_alpha_prefetch_5:
+ .prologue 0
+ srl $16, 6, $16
+
+ ldq $31, 0($17)
+ ldq $31, 0($18)
+ ldq $31, 0($19)
+ ldq $31, 0($20)
+ ldq $31, 0($21)
+
+ ldq $31, 64($17)
+ ldq $31, 64($18)
+ ldq $31, 64($19)
+ ldq $31, 64($20)
+ ldq $31, 64($21)
+
+ ldq $31, 128($17)
+ ldq $31, 128($18)
+ ldq $31, 128($19)
+ ldq $31, 128($20)
+ ldq $31, 128($21)
+
+ ldq $31, 192($17)
+ ldq $31, 192($18)
+ ldq $31, 192($19)
+ ldq $31, 192($20)
+ ldq $31, 192($21)
+ .align 4
+5:
+ ldq $0,0($17)
+ ldq $1,0($18)
+ ldq $2,0($19)
+ ldq $3,0($20)
+
+ ldq $4,0($21)
+ ldq $5,8($17)
+ ldq $6,8($18)
+ ldq $7,8($19)
+
+ ldq $22,8($20)
+ ldq $23,8($21)
+ ldq $24,16($17)
+ ldq $25,16($18)
+
+ ldq $27,16($19)
+ xor $0,$1,$1 # 6 cycles from $1 load
+ ldq $28,16($20)
+ xor $2,$3,$3 # 6 cycles from $3 load
+
+ ldq $0,16($21)
+ xor $1,$3,$3
+ ldq $1,24($17)
+ xor $3,$4,$4 # 7 cycles from $4 load
+
+ stq $4,0($17)
+ xor $5,$6,$6 # 7 cycles from $6 load
+ xor $7,$22,$22 # 7 cycles from $22 load
+ xor $6,$23,$23 # 7 cycles from $23 load
+
+ ldq $2,24($18)
+ xor $22,$23,$23
+ ldq $3,24($19)
+ xor $24,$25,$25 # 8 cycles from $25 load
+
+ stq $23,8($17)
+ xor $25,$27,$27 # 8 cycles from $27 load
+ ldq $4,24($20)
+ xor $28,$0,$0 # 7 cycles from $0 load
+
+ ldq $5,24($21)
+ xor $27,$0,$0
+ ldq $6,32($17)
+ ldq $7,32($18)
+
+ stq $0,16($17)
+ xor $1,$2,$2 # 6 cycles from $2 load
+ ldq $22,32($19)
+ xor $3,$4,$4 # 4 cycles from $4 load
+
+ ldq $23,32($20)
+ xor $2,$4,$4
+ ldq $24,32($21)
+ ldq $25,40($17)
+
+ ldq $27,40($18)
+ ldq $28,40($19)
+ ldq $0,40($20)
+ xor $4,$5,$5 # 7 cycles from $5 load
+
+ stq $5,24($17)
+ xor $6,$7,$7 # 7 cycles from $7 load
+ ldq $1,40($21)
+ ldq $2,48($17)
+
+ ldq $3,48($18)
+ xor $7,$22,$22 # 7 cycles from $22 load
+ ldq $4,48($19)
+ xor $23,$24,$24 # 6 cycles from $24 load
+
+ ldq $5,48($20)
+ xor $22,$24,$24
+ ldq $6,48($21)
+ xor $25,$27,$27 # 7 cycles from $27 load
+
+ stq $24,32($17)
+ xor $27,$28,$28 # 8 cycles from $28 load
+ ldq $7,56($17)
+ xor $0,$1,$1 # 6 cycles from $1 load
+
+ ldq $22,56($18)
+ ldq $23,56($19)
+ ldq $24,56($20)
+ ldq $25,56($21)
+
+ ldq $31,256($17)
+ xor $28,$1,$1
+ ldq $31,256($18)
+ xor $2,$3,$3 # 9 cycles from $3 load
+
+ ldq $31,256($19)
+ xor $3,$4,$4 # 9 cycles from $4 load
+ ldq $31,256($20)
+ xor $5,$6,$6 # 8 cycles from $6 load
+
+ stq $1,40($17)
+ xor $4,$6,$6
+ xor $7,$22,$22 # 7 cycles from $22 load
+ xor $23,$24,$24 # 6 cycles from $24 load
+
+ stq $6,48($17)
+ xor $22,$24,$24
+ ldq $31,256($21)
+ xor $24,$25,$25 # 8 cycles from $25 load
+
+ stq $25,56($17)
+ subq $16,1,$16
+ addq $21,64,$21
+ addq $20,64,$20
+
+ addq $19,64,$19
+ addq $18,64,$18
+ addq $17,64,$17
+ bgt $16,5b
+
+ ret
+ .end xor_alpha_prefetch_5
+");
+
+static struct xor_block_template xor_block_alpha = {
+ name: "alpha",
+ do_2: xor_alpha_2,
+ do_3: xor_alpha_3,
+ do_4: xor_alpha_4,
+ do_5: xor_alpha_5,
+};
+
+static struct xor_block_template xor_block_alpha_prefetch = {
+ name: "alpha prefetch",
+ do_2: xor_alpha_prefetch_2,
+ do_3: xor_alpha_prefetch_3,
+ do_4: xor_alpha_prefetch_4,
+ do_5: xor_alpha_prefetch_5,
+};
+
+/* For grins, also test the generic routines. */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+ do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_alpha); \
+ xor_speed(&xor_block_alpha_prefetch); \
+ } while (0)
+
+/* Force the use of alpha_prefetch if EV6, as it is significantly
+ faster in the cold cache case. */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ (implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+/*
+ * include/asm-generic/xor.h
+ *
+ * Generic optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+static void
+xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ p1[0] ^= p2[0];
+ p1[1] ^= p2[1];
+ p1[2] ^= p2[2];
+ p1[3] ^= p2[3];
+ p1[4] ^= p2[4];
+ p1[5] ^= p2[5];
+ p1[6] ^= p2[6];
+ p1[7] ^= p2[7];
+ p1 += 8;
+ p2 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ p1[0] ^= p2[0] ^ p3[0];
+ p1[1] ^= p2[1] ^ p3[1];
+ p1[2] ^= p2[2] ^ p3[2];
+ p1[3] ^= p2[3] ^ p3[3];
+ p1[4] ^= p2[4] ^ p3[4];
+ p1[5] ^= p2[5] ^ p3[5];
+ p1[6] ^= p2[6] ^ p3[6];
+ p1[7] ^= p2[7] ^ p3[7];
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
+ p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
+ p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
+ p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
+ p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
+ p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
+ p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
+ p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
+ p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
+ p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
+ p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
+ p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
+ p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
+ p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
+ p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ p5 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ register long d0, d1, d2, d3, d4, d5, d6, d7;
+ d0 = p1[0]; /* Pull the stuff into registers */
+ d1 = p1[1]; /* ... in bursts, if possible. */
+ d2 = p1[2];
+ d3 = p1[3];
+ d4 = p1[4];
+ d5 = p1[5];
+ d6 = p1[6];
+ d7 = p1[7];
+ d0 ^= p2[0];
+ d1 ^= p2[1];
+ d2 ^= p2[2];
+ d3 ^= p2[3];
+ d4 ^= p2[4];
+ d5 ^= p2[5];
+ d6 ^= p2[6];
+ d7 ^= p2[7];
+ p1[0] = d0; /* Store the result (in bursts) */
+ p1[1] = d1;
+ p1[2] = d2;
+ p1[3] = d3;
+ p1[4] = d4;
+ p1[5] = d5;
+ p1[6] = d6;
+ p1[7] = d7;
+ p1 += 8;
+ p2 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ register long d0, d1, d2, d3, d4, d5, d6, d7;
+ d0 = p1[0]; /* Pull the stuff into registers */
+ d1 = p1[1]; /* ... in bursts, if possible. */
+ d2 = p1[2];
+ d3 = p1[3];
+ d4 = p1[4];
+ d5 = p1[5];
+ d6 = p1[6];
+ d7 = p1[7];
+ d0 ^= p2[0];
+ d1 ^= p2[1];
+ d2 ^= p2[2];
+ d3 ^= p2[3];
+ d4 ^= p2[4];
+ d5 ^= p2[5];
+ d6 ^= p2[6];
+ d7 ^= p2[7];
+ d0 ^= p3[0];
+ d1 ^= p3[1];
+ d2 ^= p3[2];
+ d3 ^= p3[3];
+ d4 ^= p3[4];
+ d5 ^= p3[5];
+ d6 ^= p3[6];
+ d7 ^= p3[7];
+ p1[0] = d0; /* Store the result (in bursts) */
+ p1[1] = d1;
+ p1[2] = d2;
+ p1[3] = d3;
+ p1[4] = d4;
+ p1[5] = d5;
+ p1[6] = d6;
+ p1[7] = d7;
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ register long d0, d1, d2, d3, d4, d5, d6, d7;
+ d0 = p1[0]; /* Pull the stuff into registers */
+ d1 = p1[1]; /* ... in bursts, if possible. */
+ d2 = p1[2];
+ d3 = p1[3];
+ d4 = p1[4];
+ d5 = p1[5];
+ d6 = p1[6];
+ d7 = p1[7];
+ d0 ^= p2[0];
+ d1 ^= p2[1];
+ d2 ^= p2[2];
+ d3 ^= p2[3];
+ d4 ^= p2[4];
+ d5 ^= p2[5];
+ d6 ^= p2[6];
+ d7 ^= p2[7];
+ d0 ^= p3[0];
+ d1 ^= p3[1];
+ d2 ^= p3[2];
+ d3 ^= p3[3];
+ d4 ^= p3[4];
+ d5 ^= p3[5];
+ d6 ^= p3[6];
+ d7 ^= p3[7];
+ d0 ^= p4[0];
+ d1 ^= p4[1];
+ d2 ^= p4[2];
+ d3 ^= p4[3];
+ d4 ^= p4[4];
+ d5 ^= p4[5];
+ d6 ^= p4[6];
+ d7 ^= p4[7];
+ p1[0] = d0; /* Store the result (in bursts) */
+ p1[1] = d1;
+ p1[2] = d2;
+ p1[3] = d3;
+ p1[4] = d4;
+ p1[5] = d5;
+ p1[6] = d6;
+ p1[7] = d7;
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ } while (--lines > 0);
+}
+
+static void
+xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ long lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ register long d0, d1, d2, d3, d4, d5, d6, d7;
+ d0 = p1[0]; /* Pull the stuff into registers */
+ d1 = p1[1]; /* ... in bursts, if possible. */
+ d2 = p1[2];
+ d3 = p1[3];
+ d4 = p1[4];
+ d5 = p1[5];
+ d6 = p1[6];
+ d7 = p1[7];
+ d0 ^= p2[0];
+ d1 ^= p2[1];
+ d2 ^= p2[2];
+ d3 ^= p2[3];
+ d4 ^= p2[4];
+ d5 ^= p2[5];
+ d6 ^= p2[6];
+ d7 ^= p2[7];
+ d0 ^= p3[0];
+ d1 ^= p3[1];
+ d2 ^= p3[2];
+ d3 ^= p3[3];
+ d4 ^= p3[4];
+ d5 ^= p3[5];
+ d6 ^= p3[6];
+ d7 ^= p3[7];
+ d0 ^= p4[0];
+ d1 ^= p4[1];
+ d2 ^= p4[2];
+ d3 ^= p4[3];
+ d4 ^= p4[4];
+ d5 ^= p4[5];
+ d6 ^= p4[6];
+ d7 ^= p4[7];
+ d0 ^= p5[0];
+ d1 ^= p5[1];
+ d2 ^= p5[2];
+ d3 ^= p5[3];
+ d4 ^= p5[4];
+ d5 ^= p5[5];
+ d6 ^= p5[6];
+ d7 ^= p5[7];
+ p1[0] = d0; /* Store the result (in bursts) */
+ p1[1] = d1;
+ p1[2] = d2;
+ p1[3] = d3;
+ p1[4] = d4;
+ p1[5] = d5;
+ p1[6] = d6;
+ p1[7] = d7;
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ p5 += 8;
+ } while (--lines > 0);
+}
+
+static struct xor_block_template xor_block_8regs = {
+ name: "8regs",
+ do_2: xor_8regs_2,
+ do_3: xor_8regs_3,
+ do_4: xor_8regs_4,
+ do_5: xor_8regs_5,
+};
+
+static struct xor_block_template xor_block_32regs = {
+ name: "32regs",
+ do_2: xor_32regs_2,
+ do_3: xor_32regs_3,
+ do_4: xor_32regs_4,
+ do_5: xor_32regs_5,
+};
+
+#define XOR_TRY_TEMPLATES \
+ do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_32regs); \
+ } while (0)
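As a usage sketch of the templates defined above (buffer names and the 512-byte size are arbitrary; real callers go through the XOR_TRY_TEMPLATES / xor_speed machinery in the RAID code, and struct xor_block_template itself comes from the RAID headers), an entry point can be driven directly -- the first argument is a byte count that must be a multiple of 8 * sizeof(long), and the first buffer receives the result:

	static unsigned long buf_a[512 / sizeof(unsigned long)];
	static unsigned long buf_b[512 / sizeof(unsigned long)];

	static void demo_xor(void)
	{
		/* buf_a[i] ^= buf_b[i] over the whole 512-byte region */
		xor_block_8regs.do_2(512, buf_a, buf_b);
	}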
--- /dev/null
+/*
+ * include/asm-i386/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+#define FPU_SAVE \
+ do { \
+ if (!(current->flags & PF_USEDFPU)) \
+ __asm__ __volatile__ (" clts;\n"); \
+ __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
+ } while (0)
+
+#define FPU_RESTORE \
+ do { \
+ __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
+ if (!(current->flags & PF_USEDFPU)) \
+ stts(); \
+ } while (0)
+
+#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
+#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
+#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
+#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
+#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
+#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
+
+
+static void
+xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 7;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ XO1(i,0) \
+ ST(i,0) \
+ XO1(i+1,1) \
+ ST(i+1,1) \
+ XO1(i+2,2) \
+ ST(i+2,2) \
+ XO1(i+3,3) \
+ ST(i+3,3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 7;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ XO2(i,0) \
+ ST(i,0) \
+ XO2(i+1,1) \
+ ST(i+1,1) \
+ XO2(i+2,2) \
+ ST(i+2,2) \
+ XO2(i+3,3) \
+ ST(i+3,3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 7;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ XO2(i,0) \
+ XO2(i+1,1) \
+ XO2(i+2,2) \
+ XO2(i+3,3) \
+ XO3(i,0) \
+ ST(i,0) \
+ XO3(i+1,1) \
+ ST(i+1,1) \
+ XO3(i+2,2) \
+ ST(i+2,2) \
+ XO3(i+3,3) \
+ ST(i+3,3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 7;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ XO2(i,0) \
+ XO2(i+1,1) \
+ XO2(i+2,2) \
+ XO2(i+3,3) \
+ XO3(i,0) \
+ XO3(i+1,1) \
+ XO3(i+2,2) \
+ XO3(i+3,3) \
+ XO4(i,0) \
+ ST(i,0) \
+ XO4(i+1,1) \
+ ST(i+1,1) \
+ XO4(i+2,2) \
+ ST(i+2,2) \
+ XO4(i+3,3) \
+ ST(i+3,3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " addl $128, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "g" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef BLOCK
+
+static void
+xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 6;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+ " .align 32 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 6;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3)
+ : "memory" );
+
+ FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 6;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 6;
+ char fpu_save[108];
+
+ FPU_SAVE;
+
+ __asm__ __volatile__ (
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor (%5), %%mm0 ;\n"
+ " pxor 8(%5), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%5), %%mm2 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%5), %%mm3 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 32(%5), %%mm4 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " pxor 40(%5), %%mm5 ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%5), %%mm6 ;\n"
+ " pxor 56(%5), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " addl $64, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "g" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+ : "memory");
+
+ FPU_RESTORE;
+}
+
+static struct xor_block_template xor_block_pII_mmx = {
+ name: "pII_mmx",
+ do_2: xor_pII_mmx_2,
+ do_3: xor_pII_mmx_3,
+ do_4: xor_pII_mmx_4,
+ do_5: xor_pII_mmx_5,
+};
+
+static struct xor_block_template xor_block_p5_mmx = {
+ name: "p5_mmx",
+ do_2: xor_p5_mmx_2,
+ do_3: xor_p5_mmx_3,
+ do_4: xor_p5_mmx_4,
+ do_5: xor_p5_mmx_5,
+};
+
+#undef FPU_SAVE
+#undef FPU_RESTORE
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+#define XMMS_SAVE \
+ __asm__ __volatile__ ( \
+ "movl %%cr0,%0 ;\n\t" \
+ "clts ;\n\t" \
+ "movups %%xmm0,(%1) ;\n\t" \
+ "movups %%xmm1,0x10(%1) ;\n\t" \
+ "movups %%xmm2,0x20(%1) ;\n\t" \
+ "movups %%xmm3,0x30(%1) ;\n\t" \
+ : "=r" (cr0) \
+ : "r" (xmm_save) \
+ : "memory")
+
+#define XMMS_RESTORE \
+ __asm__ __volatile__ ( \
+ "sfence ;\n\t" \
+ "movups (%1),%%xmm0 ;\n\t" \
+ "movups 0x10(%1),%%xmm1 ;\n\t" \
+ "movups 0x20(%1),%%xmm2 ;\n\t" \
+ "movups 0x30(%1),%%xmm3 ;\n\t" \
+ "movl %0,%%cr0 ;\n\t" \
+ : \
+ : "r" (cr0), "r" (xmm_save) \
+ : "memory")
+
+#define OFFS(x) "16*("#x")"
+#define PF0(x) " prefetcht0 "OFFS(x)"(%1) ;\n"
+#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
+#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
+#define PF1(x) " prefetchnta "OFFS(x)"(%2) ;\n"
+#define PF2(x) " prefetchnta "OFFS(x)"(%3) ;\n"
+#define PF3(x) " prefetchnta "OFFS(x)"(%4) ;\n"
+#define PF4(x) " prefetchnta "OFFS(x)"(%5) ;\n"
+#define PF5(x) " prefetchnta "OFFS(x)"(%6) ;\n"
+#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
+#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
+#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
+#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
+#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4];
+ int cr0;
+
+ XMMS_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i,0) \
+ LD(i+1,1) \
+ PF1(i) \
+ PF1(i+2) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ PF0(i+4) \
+ PF0(i+6) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ ST(i,0) \
+ ST(i+1,1) \
+ ST(i+2,2) \
+ ST(i+3,3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4];
+ int cr0;
+
+ XMMS_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i+2) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ PF2(i) \
+ PF2(i+2) \
+ PF0(i+4) \
+ PF0(i+6) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ XO2(i,0) \
+ XO2(i+1,1) \
+ XO2(i+2,2) \
+ XO2(i+3,3) \
+ ST(i,0) \
+ ST(i+1,1) \
+ ST(i+2,2) \
+ ST(i+3,3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r"(p2), "r"(p3)
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4];
+ int cr0;
+
+ XMMS_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i+2) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ PF2(i) \
+ PF2(i+2) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ PF3(i) \
+ PF3(i+2) \
+ PF0(i+4) \
+ PF0(i+6) \
+ XO2(i,0) \
+ XO2(i+1,1) \
+ XO2(i+2,2) \
+ XO2(i+3,3) \
+ XO3(i,0) \
+ XO3(i+1,1) \
+ XO3(i+2,2) \
+ XO3(i+3,3) \
+ ST(i,0) \
+ ST(i+1,1) \
+ ST(i+2,2) \
+ ST(i+3,3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4];
+ int cr0;
+
+ XMMS_SAVE;
+
+ __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i+2) \
+ LD(i,0) \
+ LD(i+1,1) \
+ LD(i+2,2) \
+ LD(i+3,3) \
+ PF2(i) \
+ PF2(i+2) \
+ XO1(i,0) \
+ XO1(i+1,1) \
+ XO1(i+2,2) \
+ XO1(i+3,3) \
+ PF3(i) \
+ PF3(i+2) \
+ XO2(i,0) \
+ XO2(i+1,1) \
+ XO2(i+2,2) \
+ XO2(i+3,3) \
+ PF4(i) \
+ PF4(i+2) \
+ PF0(i+4) \
+ PF0(i+6) \
+ XO3(i,0) \
+ XO3(i+1,1) \
+ XO3(i+2,2) \
+ XO3(i+3,3) \
+ XO4(i,0) \
+ XO4(i+1,1) \
+ XO4(i+2,2) \
+ XO4(i+3,3) \
+ ST(i,0) \
+ ST(i+1,1) \
+ ST(i+2,2) \
+ ST(i+3,3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " addl $256, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ :
+ : "r" (lines),
+ "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_pIII_sse = {
+ name: "pIII_sse",
+ do_2: xor_sse_2,
+ do_3: xor_sse_3,
+ do_4: xor_sse_4,
+ do_5: xor_sse_5,
+};
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+ do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_32regs); \
+ if (cpu_has_xmm) \
+ xor_speed(&xor_block_pIII_sse); \
+ if (md_cpu_has_mmx()) { \
+ xor_speed(&xor_block_pII_mmx); \
+ xor_speed(&xor_block_p5_mmx); \
+ } \
+ } while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only, depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
--- /dev/null
+/*
+ * include/asm-ia64/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for IA-64.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *);
+extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *, unsigned long *);
+
+asm ("
+ .text
+
+ // Assume L2 memory latency of 6 cycles.
+
+ .proc xor_ia64_2
+xor_ia64_2:
+ .prologue
+ .fframe 0
+ { .mii
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 3, 0, 13, 16
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ }
+ .body
+ { .mii
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ }
+ { .mmi
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ }
+ { .mii
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ ;;
+ }
+ .rotr s1[6+1], s2[6+1], d[2]
+ .rotp p[6+2]
+0: { .mmi
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+ }
+ { .mfb
+(p[6+1]) st8.nta [r8] = d[1], 8
+ nop.f 0
+ br.ctop.dptk.few 0b
+ ;;
+ }
+ { .mii
+ mov ar.lc = r30
+ mov pr = r29, -1
+ }
+ { .bbb
+ br.ret.sptk.few rp
+ }
+ .endp xor_ia64_2
+
+ .proc xor_ia64_3
+xor_ia64_3:
+ .prologue
+ .fframe 0
+ { .mii
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 4, 0, 20, 24
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ }
+ .body
+ { .mii
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ }
+ { .mmi
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ }
+ { .mii
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ ;;
+ }
+ .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
+ .rotp p[6+2]
+0: { .mmi
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+ ;;
+ }
+ { .mmi
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[6+1]) st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], s3[6]
+ }
+ { .bbb
+ br.ctop.dptk.few 0b
+ ;;
+ }
+ { .mii
+ mov ar.lc = r30
+ mov pr = r29, -1
+ }
+ { .bbb
+ br.ret.sptk.few rp
+ }
+ .endp xor_ia64_3
+
+ .proc xor_ia64_4
+xor_ia64_4:
+ .prologue
+ .fframe 0
+ { .mii
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 5, 0, 27, 32
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ }
+ .body
+ { .mii
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ }
+ { .mmi
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ }
+ { .mii
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ }
+ { .mfb
+ mov r19 = in4
+ ;;
+ }
+ .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
+ .rotp p[6+2]
+0: { .mmi
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+ }
+ { .mmi
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[0]) ld8.nta s4[0] = [r19], 8
+(p[6]) xor r20 = s3[6], s4[6]
+ ;;
+ }
+ { .mib
+(p[6+1]) st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], r20
+ br.ctop.dptk.few 0b
+ ;;
+ }
+ { .mii
+ mov ar.lc = r30
+ mov pr = r29, -1
+ }
+ { .bbb
+ br.ret.sptk.few rp
+ }
+ .endp xor_ia64_4
+
+ .proc xor_ia64_5
+xor_ia64_5:
+ .prologue
+ .fframe 0
+ { .mii
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 6, 0, 34, 40
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ }
+ .body
+ { .mii
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ }
+ { .mmi
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ }
+ { .mii
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ }
+ { .mib
+ mov r19 = in4
+ mov r20 = in5
+ ;;
+ }
+ .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
+ .rotp p[6+2]
+0: { .mmi
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+ }
+ { .mmi
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[0]) ld8.nta s4[0] = [r19], 8
+(p[6]) xor r21 = s3[6], s4[6]
+ ;;
+ }
+ { .mmi
+(p[0]) ld8.nta s5[0] = [r20], 8
+(p[6+1]) st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], r21
+ ;;
+ }
+ { .mfb
+(p[6]) xor d[0] = d[0], s5[6]
+ nop.f 0
+ br.ctop.dptk.few 0b
+ ;;
+ }
+ { .mii
+ mov ar.lc = r30
+ mov pr = r29, -1
+ }
+ { .bbb
+ br.ret.sptk.few rp
+ }
+ .endp xor_ia64_5
+");
+
+static struct xor_block_template xor_block_ia64 = {
+ name: "ia64",
+ do_2: xor_ia64_2,
+ do_3: xor_ia64_3,
+ do_4: xor_ia64_4,
+ do_5: xor_ia64_5,
+};
+
+#define XOR_TRY_TEMPLATES xor_speed(&xor_block_ia64)
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+#include <asm-generic/xor.h>
--- /dev/null
+/*
+ * include/asm-sparc/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for 32-bit Sparc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High speed xor_block operation for RAID4/5 utilizing the
+ * ldd/std SPARC instructions.
+ *
+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+static void
+sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ int lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ __asm__ __volatile__("
+ ldd [%0 + 0x00], %%g2
+ ldd [%0 + 0x08], %%g4
+ ldd [%0 + 0x10], %%o0
+ ldd [%0 + 0x18], %%o2
+ ldd [%1 + 0x00], %%o4
+ ldd [%1 + 0x08], %%l0
+ ldd [%1 + 0x10], %%l2
+ ldd [%1 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ std %%g2, [%0 + 0x00]
+ std %%g4, [%0 + 0x08]
+ std %%o0, [%0 + 0x10]
+ std %%o2, [%0 + 0x18]
+ "
+ :
+ : "r" (p1), "r" (p2)
+ : "g2", "g3", "g4", "g5",
+ "o0", "o1", "o2", "o3", "o4", "o5",
+ "l0", "l1", "l2", "l3", "l4", "l5");
+ p1 += 8;
+ p2 += 8;
+ } while (--lines > 0);
+}
+
+static void
+sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ int lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ __asm__ __volatile__("
+ ldd [%0 + 0x00], %%g2
+ ldd [%0 + 0x08], %%g4
+ ldd [%0 + 0x10], %%o0
+ ldd [%0 + 0x18], %%o2
+ ldd [%1 + 0x00], %%o4
+ ldd [%1 + 0x08], %%l0
+ ldd [%1 + 0x10], %%l2
+ ldd [%1 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%2 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%2 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%2 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%2 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ std %%g2, [%0 + 0x00]
+ std %%g4, [%0 + 0x08]
+ std %%o0, [%0 + 0x10]
+ std %%o2, [%0 + 0x18]
+ "
+ :
+ : "r" (p1), "r" (p2), "r" (p3)
+ : "g2", "g3", "g4", "g5",
+ "o0", "o1", "o2", "o3", "o4", "o5",
+ "l0", "l1", "l2", "l3", "l4", "l5");
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ } while (--lines > 0);
+}
+
+static void
+sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ int lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ __asm__ __volatile__("
+ ldd [%0 + 0x00], %%g2
+ ldd [%0 + 0x08], %%g4
+ ldd [%0 + 0x10], %%o0
+ ldd [%0 + 0x18], %%o2
+ ldd [%1 + 0x00], %%o4
+ ldd [%1 + 0x08], %%l0
+ ldd [%1 + 0x10], %%l2
+ ldd [%1 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%2 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%2 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%2 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%2 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%3 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%3 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%3 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%3 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ std %%g2, [%0 + 0x00]
+ std %%g4, [%0 + 0x08]
+ std %%o0, [%0 + 0x10]
+ std %%o2, [%0 + 0x18]
+ "
+ :
+ : "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+ : "g2", "g3", "g4", "g5",
+ "o0", "o1", "o2", "o3", "o4", "o5",
+ "l0", "l1", "l2", "l3", "l4", "l5");
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ } while (--lines > 0);
+}
+
+static void
+sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ int lines = bytes / (sizeof (long)) / 8;
+
+ do {
+ __asm__ __volatile__("
+ ldd [%0 + 0x00], %%g2
+ ldd [%0 + 0x08], %%g4
+ ldd [%0 + 0x10], %%o0
+ ldd [%0 + 0x18], %%o2
+ ldd [%1 + 0x00], %%o4
+ ldd [%1 + 0x08], %%l0
+ ldd [%1 + 0x10], %%l2
+ ldd [%1 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%2 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%2 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%2 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%2 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%3 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%3 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%3 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%3 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ ldd [%4 + 0x00], %%o4
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ ldd [%4 + 0x08], %%l0
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ ldd [%4 + 0x10], %%l2
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ ldd [%4 + 0x18], %%l4
+ xor %%g2, %%o4, %%g2
+ xor %%g3, %%o5, %%g3
+ xor %%g4, %%l0, %%g4
+ xor %%g5, %%l1, %%g5
+ xor %%o0, %%l2, %%o0
+ xor %%o1, %%l3, %%o1
+ xor %%o2, %%l4, %%o2
+ xor %%o3, %%l5, %%o3
+ std %%g2, [%0 + 0x00]
+ std %%g4, [%0 + 0x08]
+ std %%o0, [%0 + 0x10]
+ std %%o2, [%0 + 0x18]
+ "
+ :
+ : "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+ : "g2", "g3", "g4", "g5",
+ "o0", "o1", "o2", "o3", "o4", "o5",
+ "l0", "l1", "l2", "l3", "l4", "l5");
+ p1 += 8;
+ p2 += 8;
+ p3 += 8;
+ p4 += 8;
+ p5 += 8;
+ } while (--lines > 0);
+}
+
+static struct xor_block_template xor_block_SPARC = {
+ name: "SPARC",
+ do_2: sparc_2,
+ do_3: sparc_3,
+ do_4: sparc_4,
+ do_5: sparc_5,
+};
+
+/* For grins, also test the generic routines. */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+ do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_SPARC); \
+ } while (0)
--- /dev/null
+/*
+ * include/asm-sparc64/xor.h
+ *
+ * High speed xor_block operation for RAID4/5 utilizing the
+ * UltraSparc Visual Instruction Set.
+ *
+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Requirements:
+ * !(((long)dest | (long)sourceN) & (64 - 1)) &&
+ * !(len & 127) && len >= 256
+ *
+ * It is done in pure assembly, as otherwise gcc makes it a non-leaf
+ * function, which is not what we want.
+ */
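As a caller-side sketch of the requirements quoted above (the function and parameter names here are placeholders, not part of the interface), a check before dispatching to the VIS routines might look like:

	static int xor_vis_args_ok(unsigned long len, void *dest, void *src)
	{
		if (((unsigned long)dest | (unsigned long)src) & (64 - 1))
			return 0;	/* both buffers must be 64-byte aligned */
		if ((len & 127) || len < 256)
			return 0;	/* len a multiple of 128, and at least 256 */
		return 1;
	}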
+
+#include <asm/pstate.h>
+#include <asm/asi.h>
+
+extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *);
+extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *, unsigned long *);
+
+#define _S(x) __S(x)
+#define __S(x) #x
+#define DEF(x) __asm__(#x " = " _S(x))
+
+DEF(FPRS_FEF);
+DEF(FPRS_DU);
+DEF(ASI_BLK_P);
+
+/* ??? We set and use %asi instead of using ASI_BLK_P directly because gas
+ currently does not accept symbolic constants for the ASI specifier. */
+
+__asm__ ("
+ .text
+ .globl xor_vis_2
+ .type xor_vis_2,@function
+xor_vis_2:
+ rd %fprs, %g1
+ andcc %g1, FPRS_FEF|FPRS_DU, %g0
+ be,pt %icc, 0f
+ sethi %hi(VISenter), %g1
+ jmpl %g1 + %lo(VISenter), %g7
+ add %g7, 8, %g7
+0: wr %g0, FPRS_FEF, %fprs
+ rd %asi, %g1
+ wr %g0, ASI_BLK_P, %asi
+ membar #LoadStore|#StoreLoad|#StoreStore
+ sub %o0, 128, %o0
+ ldda [%o1] %asi, %f0
+ ldda [%o2] %asi, %f16
+
+2: ldda [%o1 + 64] %asi, %f32
+ fxor %f0, %f16, %f16
+ fxor %f2, %f18, %f18
+ fxor %f4, %f20, %f20
+ fxor %f6, %f22, %f22
+ fxor %f8, %f24, %f24
+ fxor %f10, %f26, %f26
+ fxor %f12, %f28, %f28
+ fxor %f14, %f30, %f30
+ stda %f16, [%o1] %asi
+ ldda [%o2 + 64] %asi, %f48
+ ldda [%o1 + 128] %asi, %f0
+ fxor %f32, %f48, %f48
+ fxor %f34, %f50, %f50
+ add %o1, 128, %o1
+ fxor %f36, %f52, %f52
+ add %o2, 128, %o2
+ fxor %f38, %f54, %f54
+ subcc %o0, 128, %o0
+ fxor %f40, %f56, %f56
+ fxor %f42, %f58, %f58
+ fxor %f44, %f60, %f60
+ fxor %f46, %f62, %f62
+ stda %f48, [%o1 - 64] %asi
+ bne,pt %xcc, 2b
+ ldda [%o2] %asi, %f16
+
+ ldda [%o1 + 64] %asi, %f32
+ fxor %f0, %f16, %f16
+ fxor %f2, %f18, %f18
+ fxor %f4, %f20, %f20
+ fxor %f6, %f22, %f22
+ fxor %f8, %f24, %f24
+ fxor %f10, %f26, %f26
+ fxor %f12, %f28, %f28
+ fxor %f14, %f30, %f30
+ stda %f16, [%o1] %asi
+ ldda [%o2 + 64] %asi, %f48
+ membar #Sync
+ fxor %f32, %f48, %f48
+ fxor %f34, %f50, %f50
+ fxor %f36, %f52, %f52
+ fxor %f38, %f54, %f54
+ fxor %f40, %f56, %f56
+ fxor %f42, %f58, %f58
+ fxor %f44, %f60, %f60
+ fxor %f46, %f62, %f62
+ stda %f48, [%o1 + 64] %asi
+ membar #Sync|#StoreStore|#StoreLoad
+ wr %g1, %g0, %asi
+ retl
+ wr %g0, 0, %fprs
+ .size xor_vis_2, .-xor_vis_2
+
+
+ .globl xor_vis_3
+ .type xor_vis_3,@function
+xor_vis_3:
+ rd %fprs, %g1
+ andcc %g1, FPRS_FEF|FPRS_DU, %g0
+ be,pt %icc, 0f
+ sethi %hi(VISenter), %g1
+ jmpl %g1 + %lo(VISenter), %g7
+ add %g7, 8, %g7
+0: wr %g0, FPRS_FEF, %fprs
+ rd %asi, %g1
+ wr %g0, ASI_BLK_P, %asi
+ membar #LoadStore|#StoreLoad|#StoreStore
+ sub %o0, 64, %o0
+ ldda [%o1] %asi, %f0
+ ldda [%o2] %asi, %f16
+
+3: ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f48
+ fxor %f2, %f18, %f50
+ add %o1, 64, %o1
+ fxor %f4, %f20, %f52
+ fxor %f6, %f22, %f54
+ add %o2, 64, %o2
+ fxor %f8, %f24, %f56
+ fxor %f10, %f26, %f58
+ fxor %f12, %f28, %f60
+ fxor %f14, %f30, %f62
+ ldda [%o1] %asi, %f0
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ add %o3, 64, %o3
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ subcc %o0, 64, %o0
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ stda %f48, [%o1 - 64] %asi
+ bne,pt %xcc, 3b
+ ldda [%o2] %asi, %f16
+
+ ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f48
+ fxor %f2, %f18, %f50
+ fxor %f4, %f20, %f52
+ fxor %f6, %f22, %f54
+ fxor %f8, %f24, %f56
+ fxor %f10, %f26, %f58
+ fxor %f12, %f28, %f60
+ fxor %f14, %f30, %f62
+ membar #Sync
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ stda %f48, [%o1] %asi
+ membar #Sync|#StoreStore|#StoreLoad
+ wr %g1, %g0, %asi
+ retl
+ wr %g0, 0, %fprs
+ .size xor_vis_3, .-xor_vis_3
+
+
+ .globl xor_vis_4
+ .type xor_vis_4,@function
+xor_vis_4:
+ rd %fprs, %g1
+ andcc %g1, FPRS_FEF|FPRS_DU, %g0
+ be,pt %icc, 0f
+ sethi %hi(VISenter), %g1
+ jmpl %g1 + %lo(VISenter), %g7
+ add %g7, 8, %g7
+0: wr %g0, FPRS_FEF, %fprs
+ rd %asi, %g1
+ wr %g0, ASI_BLK_P, %asi
+ membar #LoadStore|#StoreLoad|#StoreStore
+ sub %o0, 64, %o0
+ ldda [%o1] %asi, %f0
+ ldda [%o2] %asi, %f16
+
+4: ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f16
+ fxor %f2, %f18, %f18
+ add %o1, 64, %o1
+ fxor %f4, %f20, %f20
+ fxor %f6, %f22, %f22
+ add %o2, 64, %o2
+ fxor %f8, %f24, %f24
+ fxor %f10, %f26, %f26
+ fxor %f12, %f28, %f28
+ fxor %f14, %f30, %f30
+ ldda [%o4] %asi, %f48
+ fxor %f16, %f32, %f32
+ fxor %f18, %f34, %f34
+ fxor %f20, %f36, %f36
+ fxor %f22, %f38, %f38
+ add %o3, 64, %o3
+ fxor %f24, %f40, %f40
+ fxor %f26, %f42, %f42
+ fxor %f28, %f44, %f44
+ fxor %f30, %f46, %f46
+ ldda [%o1] %asi, %f0
+ fxor %f32, %f48, %f48
+ fxor %f34, %f50, %f50
+ fxor %f36, %f52, %f52
+ add %o4, 64, %o4
+ fxor %f38, %f54, %f54
+ fxor %f40, %f56, %f56
+ fxor %f42, %f58, %f58
+ subcc %o0, 64, %o0
+ fxor %f44, %f60, %f60
+ fxor %f46, %f62, %f62
+ stda %f48, [%o1 - 64] %asi
+ bne,pt %xcc, 4b
+ ldda [%o2] %asi, %f16
+
+ ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f16
+ fxor %f2, %f18, %f18
+ fxor %f4, %f20, %f20
+ fxor %f6, %f22, %f22
+ fxor %f8, %f24, %f24
+ fxor %f10, %f26, %f26
+ fxor %f12, %f28, %f28
+ fxor %f14, %f30, %f30
+ ldda [%o4] %asi, %f48
+ fxor %f16, %f32, %f32
+ fxor %f18, %f34, %f34
+ fxor %f20, %f36, %f36
+ fxor %f22, %f38, %f38
+ fxor %f24, %f40, %f40
+ fxor %f26, %f42, %f42
+ fxor %f28, %f44, %f44
+ fxor %f30, %f46, %f46
+ membar #Sync
+ fxor %f32, %f48, %f48
+ fxor %f34, %f50, %f50
+ fxor %f36, %f52, %f52
+ fxor %f38, %f54, %f54
+ fxor %f40, %f56, %f56
+ fxor %f42, %f58, %f58
+ fxor %f44, %f60, %f60
+ fxor %f46, %f62, %f62
+ stda %f48, [%o1] %asi
+ membar #Sync|#StoreStore|#StoreLoad
+ wr %g1, %g0, %asi
+ retl
+ wr %g0, 0, %fprs
+ .size xor_vis_4, .-xor_vis_4
+
+
+ .globl xor_vis_5
+ .type xor_vis_5,@function
+xor_vis_5:
+ rd %fprs, %g1
+ andcc %g1, FPRS_FEF|FPRS_DU, %g0
+ be,pt %icc, 0f
+ sethi %hi(VISenter), %g1
+ jmpl %g1 + %lo(VISenter), %g7
+ add %g7, 8, %g7
+0: wr %g0, FPRS_FEF, %fprs
+ rd %asi, %g1
+ wr %g0, ASI_BLK_P, %asi
+ membar #LoadStore|#StoreLoad|#StoreStore
+ sub %o0, 64, %o0
+ ldda [%o1] %asi, %f0
+ ldda [%o2] %asi, %f16
+
+5: ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f48
+ fxor %f2, %f18, %f50
+ add %o1, 64, %o1
+ fxor %f4, %f20, %f52
+ fxor %f6, %f22, %f54
+ add %o2, 64, %o2
+ fxor %f8, %f24, %f56
+ fxor %f10, %f26, %f58
+ fxor %f12, %f28, %f60
+ fxor %f14, %f30, %f62
+ ldda [%o4] %asi, %f16
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ add %o3, 64, %o3
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ ldda [%o5] %asi, %f32
+ fxor %f48, %f16, %f48
+ fxor %f50, %f18, %f50
+ add %o4, 64, %o4
+ fxor %f52, %f20, %f52
+ fxor %f54, %f22, %f54
+ add %o5, 64, %o5
+ fxor %f56, %f24, %f56
+ fxor %f58, %f26, %f58
+ fxor %f60, %f28, %f60
+ fxor %f62, %f30, %f62
+ ldda [%o1] %asi, %f0
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ subcc %o0, 64, %o0
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ stda %f48, [%o1 - 64] %asi
+ bne,pt %xcc, 5b
+ ldda [%o2] %asi, %f16
+
+ ldda [%o3] %asi, %f32
+ fxor %f0, %f16, %f48
+ fxor %f2, %f18, %f50
+ fxor %f4, %f20, %f52
+ fxor %f6, %f22, %f54
+ fxor %f8, %f24, %f56
+ fxor %f10, %f26, %f58
+ fxor %f12, %f28, %f60
+ fxor %f14, %f30, %f62
+ ldda [%o4] %asi, %f16
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ ldda [%o5] %asi, %f32
+ fxor %f48, %f16, %f48
+ fxor %f50, %f18, %f50
+ fxor %f52, %f20, %f52
+ fxor %f54, %f22, %f54
+ fxor %f56, %f24, %f56
+ fxor %f58, %f26, %f58
+ fxor %f60, %f28, %f60
+ fxor %f62, %f30, %f62
+ membar #Sync
+ fxor %f48, %f32, %f48
+ fxor %f50, %f34, %f50
+ fxor %f52, %f36, %f52
+ fxor %f54, %f38, %f54
+ fxor %f56, %f40, %f56
+ fxor %f58, %f42, %f58
+ fxor %f60, %f44, %f60
+ fxor %f62, %f46, %f62
+ stda %f48, [%o1] %asi
+ membar #Sync|#StoreStore|#StoreLoad
+ wr %g1, %g0, %asi
+ retl
+ wr %g0, 0, %fprs
+ .size xor_vis_5, .-xor_vis_5
+");
+
+static struct xor_block_template xor_block_VIS = {
+ name: "VIS",
+ do_2: xor_vis_2,
+ do_3: xor_vis_3,
+ do_4: xor_vis_4,
+ do_5: xor_vis_5,
+};
+
+#define XOR_TRY_TEMPLATES xor_speed(&xor_block_VIS)
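The VIS routines depend on the preconditions spelled out in the "Requirements"
comment near the top of this file: destination and all sources 64-byte
aligned, length a multiple of 128 and at least 256 bytes.  Purely as an
illustration (no such check exists in the real code, since the md layer only
hands these routines page-aligned, page-sized buffers), the requirement
translates to:

/* Direct translation of the documented preconditions into a predicate. */
static inline int xor_vis_args_ok(unsigned long dest, unsigned long src,
				  unsigned long len)
{
	return !((dest | src) & (64 - 1)) &&	/* 64-byte aligned    */
	       !(len & 127) &&			/* multiple of 128    */
	       len >= 256;			/* at least 256 bytes */
}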
@@ -73,7+73,7 @@ extern struct kernel_param __setup_start, __setup_end;
* Mark functions and data as being only used at initialization
* or exit time.
*/
-#define __init __attribute__ ((__section__ (".text.init")))
+#define __init /* __attribute__ ((__section__ (".text.init"))) */
#define __exit __attribute__ ((unused, __section__(".text.exit")))
#define __initdata __attribute__ ((__section__ (".data.init")))
#define __exitdata __attribute__ ((unused, __section__ (".data.exit")))
-#ifndef _XOR_H
-#define _XOR_H
-
-#include <linux/raid/md.h>
-
-#define MAX_XOR_BLOCKS 4
-
-extern void calibrate_xor_block(void);
-extern void (*xor_block)(unsigned int count,
- struct buffer_head **bh_ptr);
-
-#endif
+#ifndef _XOR_H
+#define _XOR_H
+
+#include <linux/raid/md.h>
+
+#define MAX_XOR_BLOCKS 5
+
+extern void xor_block(unsigned int count, struct buffer_head **bh_ptr);
+
+struct xor_block_template {
+ struct xor_block_template *next;
+ const char *name;
+ int speed;
+ void (*do_2)(unsigned long, unsigned long *, unsigned long *);
+ void (*do_3)(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *);
+ void (*do_4)(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+ void (*do_5)(unsigned long, unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *, unsigned long *);
+};
+
+#endif
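The do_2..do_5 hooks are selected by the number of blocks being combined, and
the first buffer doubles as the destination.  A sketch of the dispatch side,
assuming an active_template pointer left behind by the calibration pass (this
is not the verbatim md code):

#include <linux/fs.h>			/* struct buffer_head */
#include <linux/raid/xor.h>

extern struct xor_block_template *active_template;	/* assumed */

void xor_block_sketch(unsigned int count, struct buffer_head **bh_ptr)
{
	unsigned long *p0, *p1, *p2, *p3, *p4;
	unsigned long bytes = bh_ptr[0]->b_size;

	p0 = (unsigned long *) bh_ptr[0]->b_data;
	p1 = (unsigned long *) bh_ptr[1]->b_data;
	if (count == 2) {
		active_template->do_2(bytes, p0, p1);
		return;
	}
	p2 = (unsigned long *) bh_ptr[2]->b_data;
	if (count == 3) {
		active_template->do_3(bytes, p0, p1, p2);
		return;
	}
	p3 = (unsigned long *) bh_ptr[3]->b_data;
	if (count == 4) {
		active_template->do_4(bytes, p0, p1, p2, p3);
		return;
	}
	p4 = (unsigned long *) bh_ptr[4]->b_data;
	active_template->do_5(bytes, p0, p1, p2, p3, p4);
}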
@@ -486,10+486,6 @@ EXPORT_SYMBOL(remove_inode_hash);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
-EXPORT_SYMBOL(__down);
-EXPORT_SYMBOL(__down_interruptible);
-EXPORT_SYMBOL(__down_trylock);
-EXPORT_SYMBOL(__up);
EXPORT_SYMBOL(brw_page);
#ifdef CONFIG_UID16
@@ -433,15+433,27 @@ static inline void __schedule_tail(struct task_struct *prev)
int policy;
/*
+ * prev->policy may only be written from here while `prev' cannot
+ * yet be rescheduled, i.e. before prev->has_cpu is cleared.
+ * It must of course also be read before prev can be rescheduled,
+ * but since the write to prev->policy depends on that read having
+ * completed, a wmb() is sufficient.  (The spin_lock() taken before
+ * clearing has_cpu is not enough on its own: spin_lock() is only a
+ * one-way barrier, so stores issued before the lock may still be
+ * reordered into the critical section.)
+ */
+ policy = prev->policy;
+ prev->policy = policy & ~SCHED_YIELD;
+ wmb();
+
+ /*
* fast path falls through. We have to clear has_cpu before
* checking prev->state to avoid a wakeup race - thus we
* also have to protect against the task exiting early.
*/
task_lock(prev);
- policy = prev->policy;
- prev->policy = policy & ~SCHED_YIELD;
prev->has_cpu = 0;
- wmb();
+ mb();
if (prev->state == TASK_RUNNING)
goto needs_resched;