2.4.0-test11pre4
author     Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:39:59 +0000 (15:39 -0500)
committer  Linus Torvalds <torvalds@linuxfoundation.org>
           Fri, 23 Nov 2007 20:39:59 +0000 (15:39 -0500)
   - Andrea Arcangeli: SMP scheduler memory barrier fixup
   - Richard Henderson: fix alpha semaphores and spinlock bugs.
   - Richard Henderson: clean up the file from hell: "xor.c"

32 files changed:
arch/alpha/config.in
arch/alpha/kernel/alpha_ksyms.c
arch/alpha/kernel/semaphore.c
arch/alpha/kernel/time.c
arch/alpha/lib/Makefile
arch/alpha/lib/semaphore.S [deleted file]
drivers/md/raid5.c
drivers/md/xor.c
drivers/net/irda/nsc-ircc.c
drivers/scsi/a2091.c
include/asm-alpha/atomic.h
include/asm-alpha/compiler.h
include/asm-alpha/semaphore-helper.h [deleted file]
include/asm-alpha/semaphore.h
include/asm-alpha/spinlock.h
include/asm-alpha/xor.h [new file with mode: 0644]
include/asm-arm/xor.h [new file with mode: 0644]
include/asm-generic/xor.h [new file with mode: 0644]
include/asm-i386/xor.h [new file with mode: 0644]
include/asm-ia64/xor.h [new file with mode: 0644]
include/asm-m68k/xor.h [new file with mode: 0644]
include/asm-mips/xor.h [new file with mode: 0644]
include/asm-mips64/xor.h [new file with mode: 0644]
include/asm-ppc/xor.h [new file with mode: 0644]
include/asm-s390/xor.h [new file with mode: 0644]
include/asm-sh/xor.h [new file with mode: 0644]
include/asm-sparc/xor.h [new file with mode: 0644]
include/asm-sparc64/xor.h [new file with mode: 0644]
include/linux/init.h
include/linux/raid/xor.h
kernel/ksyms.c
kernel/sched.c

diff --git a/arch/alpha/config.in b/arch/alpha/config.in
index 2035708..e01e0c0 100644
@@ -63,7 +63,6 @@ unset CONFIG_ALPHA_T2 CONFIG_ALPHA_PYXIS CONFIG_ALPHA_POLARIS
 unset CONFIG_ALPHA_TSUNAMI CONFIG_ALPHA_MCPCIA
 unset CONFIG_ALPHA_IRONGATE
 unset CONFIG_ALPHA_BROKEN_IRQ_MASK
-unset CONFIG_ALPHA_LARGE_VMALLOC
 
 # Most of these machines have ISA slots; not exactly sure which don't,
 # and this doesn't activate hordes of code, so do it always.
@@ -215,6 +214,8 @@ if [ "$CONFIG_ALPHA_GENERIC" = "y" -o "$CONFIG_ALPHA_DP264" = "y" \
        -o "$CONFIG_ALPHA_WILDFIRE" = "y" -o "$CONFIG_ALPHA_TITAN" = "y" ]
 then
        bool 'Large VMALLOC support' CONFIG_ALPHA_LARGE_VMALLOC
+else
+       define_bool CONFIG_ALPHA_LARGE_VMALLOC n
 fi
 
 source drivers/pci/Config.in
diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index 4ac2e8b..17285ac 100644
@@ -160,15 +160,20 @@ EXPORT_SYMBOL_NOVERS(__do_clear_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__strnlen_user);
 
-/*
- * The following are specially called from the semaphore assembly stubs.
- */
-EXPORT_SYMBOL_NOVERS(__down_failed);
-EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
-EXPORT_SYMBOL_NOVERS(__up_wakeup);
-EXPORT_SYMBOL_NOVERS(__down_read_failed);
-EXPORT_SYMBOL_NOVERS(__down_write_failed);
-EXPORT_SYMBOL_NOVERS(__rwsem_wake);
+/* Semaphore helper functions.  */
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__up_wakeup);
+EXPORT_SYMBOL(down);
+EXPORT_SYMBOL(down_interruptible);
+EXPORT_SYMBOL(up);
+EXPORT_SYMBOL(__down_read_failed);
+EXPORT_SYMBOL(__down_write_failed);
+EXPORT_SYMBOL(__rwsem_wake);
+EXPORT_SYMBOL(down_read);
+EXPORT_SYMBOL(down_write);
+EXPORT_SYMBOL(up_read);
+EXPORT_SYMBOL(up_write);
 
 /* 
  * SMP-specific symbols.
diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c
dissimilarity index 72%
index e6903f4..7c1c9a8 100644
-/*
- *  Generic semaphore code. Buyer beware. Do your own
- * specific changes in <asm/semaphore-helper.h>
- */
-
-#include <linux/sched.h>
-#include <asm/semaphore-helper.h>
-
-/*
- * Semaphores are implemented using a two-way counter:
- * The "count" variable is decremented for each process
- * that tries to sleep, while the "waking" variable is
- * incremented when the "up()" code goes to wake up waiting
- * processes.
- *
- * Notably, the inline "up()" and "down()" functions can
- * efficiently test if they need to do any extra work (up
- * needs to do something only if count was negative before
- * the increment operation.
- *
- * waking_non_zero() (from asm/semaphore.h) must execute
- * atomically.
- *
- * When __up() is called, the count was negative before
- * incrementing it, and we need to wake up somebody.
- *
- * This routine adds one to the count of processes that need to
- * wake up and exit.  ALL waiting processes actually wake up but
- * only the one that gets to the "waking" field first will gate
- * through and acquire the semaphore.  The others will go back
- * to sleep.
- *
- * Note that these functions are only called when there is
- * contention on the lock, and as such all this is the
- * "non-critical" part of the whole semaphore business. The
- * critical part is the inline stuff in <asm/semaphore.h>
- * where we want to avoid any extra jumps and calls.
- */
-
-void
-__up(struct semaphore *sem)
-{
-       wake_one_more(sem);
-       wake_up(&sem->wait);
-}
-
-/*
- * Perform the "down" function.  Return zero for semaphore acquired,
- * return negative for signalled out of the function.
- *
- * If called from __down, the return is ignored and the wait loop is
- * not interruptible.  This means that a task waiting on a semaphore
- * using "down()" cannot be killed until someone does an "up()" on
- * the semaphore.
- *
- * If called from __down_interruptible, the return value gets checked
- * upon return.  If the return value is negative then the task continues
- * with the negative value in the return register (it can be tested by
- * the caller).
- *
- * Either form may be used in conjunction with "up()".
- *
- */
-
-#define DOWN_VAR                               \
-       struct task_struct *tsk = current;      \
-       wait_queue_t wait;                      \
-       init_waitqueue_entry(&wait, tsk)
-
-#define DOWN_HEAD(task_state)                                          \
-                                                                       \
-                                                                       \
-       tsk->state = (task_state);                                      \
-       add_wait_queue(&sem->wait, &wait);                              \
-                                                                       \
-       /*                                                              \
-        * Ok, we're set up.  sem->count is known to be less than zero  \
-        * so we must wait.                                             \
-        *                                                              \
-        * We can let go the lock for purposes of waiting.              \
-        * We re-acquire it after awaking so as to protect              \
-        * all semaphore operations.                                    \
-        *                                                              \
-        * If "up()" is called before we call waking_non_zero() then    \
-        * we will catch it right away.  If it is called later then     \
-        * we will have to go through a wakeup cycle to catch it.       \
-        *                                                              \
-        * Multiple waiters contend for the semaphore lock to see       \
-        * who gets to gate through and who has to wait some more.      \
-        */                                                             \
-       for (;;) {
-
-#define DOWN_TAIL(task_state)                  \
-               tsk->state = (task_state);      \
-       }                                       \
-       tsk->state = TASK_RUNNING;              \
-       remove_wait_queue(&sem->wait, &wait)
-
-void
-__down(struct semaphore * sem)
-{
-       DOWN_VAR;
-       DOWN_HEAD(TASK_UNINTERRUPTIBLE);
-
-       if (waking_non_zero(sem))
-               break;
-       schedule();
-
-       DOWN_TAIL(TASK_UNINTERRUPTIBLE);
-}
-
-int
-__down_interruptible(struct semaphore * sem)
-{
-       int ret = 0;
-       DOWN_VAR;
-       DOWN_HEAD(TASK_INTERRUPTIBLE);
-
-       ret = waking_non_zero_interruptible(sem, tsk);
-       if (ret)
-       {
-               if (ret == 1)
-                       /* ret != 0 only if we get interrupted -arca */
-                       ret = 0;
-               break;
-       }
-       schedule();
-
-       DOWN_TAIL(TASK_INTERRUPTIBLE);
-       return ret;
-}
-
-int
-__down_trylock(struct semaphore * sem)
-{
-       return waking_non_zero_trylock(sem);
-}
-
-
-/*
- * RW Semaphores
- */
-
-void
-__down_read(struct rw_semaphore *sem, int count)
-{
-       long tmp;
-       DOWN_VAR;
-
- retry_down:
-       if (count < 0) {
-               /* Wait for the lock to become unbiased.  Readers
-                  are non-exclusive.  */
-               
-               /* This takes care of granting the lock.  */
-               up_read(sem);
-
-               add_wait_queue(&sem->wait, &wait);
-               while (sem->count < 0) {
-                       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-                       if (sem->count >= 0)
-                               break;
-                       schedule();
-               }
-
-               remove_wait_queue(&sem->wait, &wait);
-               tsk->state = TASK_RUNNING;
-
-               __asm __volatile (
-                       "       mb\n"
-                       "1:     ldl_l   %0,%1\n"
-                       "       subl    %0,1,%2\n"
-                       "       subl    %0,1,%0\n"
-                       "       stl_c   %2,%1\n"
-                       "       bne     %2,2f\n"
-                       ".subsection 2\n"
-                       "2:     br      1b\n"
-                       ".previous"
-                       : "=r"(count), "=m"(sem->count), "=r"(tmp)
-                       : : "memory");
-               if (count <= 0)
-                       goto retry_down;
-       } else {
-               add_wait_queue(&sem->wait, &wait);
-
-               while (1) {
-                       if (test_and_clear_bit(0, &sem->granted))
-                               break;
-                       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-                       if ((sem->granted & 1) == 0)
-                               schedule();
-               }
-
-               remove_wait_queue(&sem->wait, &wait);
-               tsk->state = TASK_RUNNING;
-       }
-}
-
-void
-__down_write(struct rw_semaphore *sem, int count)
-{
-       long tmp;
-       DOWN_VAR;
-
- retry_down:
-       if (count + RW_LOCK_BIAS < 0) {
-               up_write(sem);
-
-               add_wait_queue_exclusive(&sem->wait, &wait);
-       
-               while (sem->count < 0) {
-                       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-                       if (sem->count >= RW_LOCK_BIAS)
-                               break;
-                       schedule();
-               }
-
-               remove_wait_queue(&sem->wait, &wait);
-               tsk->state = TASK_RUNNING;
-
-               __asm __volatile (
-                       "       mb\n"
-                       "1:     ldl_l   %0,%1\n"
-                       "       ldah    %2,%3(%0)\n"
-                       "       ldah    %0,%3(%0)\n"
-                       "       stl_c   %2,%1\n"
-                       "       bne     %2,2f\n"
-                       ".subsection 2\n"
-                       "2:     br      1b\n"
-                       ".previous"
-                       : "=r"(count), "=m"(sem->count), "=r"(tmp)
-                       : "i"(-(RW_LOCK_BIAS >> 16))
-                       : "memory");
-               if (count != 0)
-                       goto retry_down;
-       } else {
-               /* Put ourselves at the end of the list.  */
-               add_wait_queue_exclusive(&sem->write_bias_wait, &wait);
-
-               while (1) {
-                       if (test_and_clear_bit(1, &sem->granted))
-                               break;
-                       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-                       if ((sem->granted & 2) == 0)
-                               schedule();
-               }
-
-               remove_wait_queue(&sem->write_bias_wait, &wait);
-               tsk->state = TASK_RUNNING;
-
-               /* If the lock is currently unbiased, awaken the sleepers.
-                  FIXME: This wakes up the readers early in a bit of a
-                  stampede -> bad!  */
-               if (sem->count >= 0)
-                       wake_up(&sem->wait);
-       }
-}
-
-void
-__do_rwsem_wake(struct rw_semaphore *sem, int readers)
-{
-       if (readers) {
-               if (test_and_set_bit(0, &sem->granted))
-                       BUG();
-               wake_up(&sem->wait);
-       } else {
-               if (test_and_set_bit(1, &sem->granted))
-                       BUG();
-               wake_up(&sem->write_bias_wait);
-       }
-}
+/*
+ * Alpha semaphore implementation.
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ * (C) Copyright 1999, 2000 Richard Henderson
+ */
+
+#include <linux/sched.h>
+
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ * 
+ * The "count" variable is decremented for each process that tries to sleep,
+ * while the "waking" variable is incremented when the "up()" code goes to
+ * wake up waiting processes.
+ *
+ * Notably, the inline "up()" and "down()" functions can efficiently test
+ * if they need to do any extra work (up needs to do something only if count
+ * was negative before the increment operation).
+ *
+ * waking_non_zero() (from asm/semaphore.h) must execute atomically.
+ *
+ * When __up() is called, the count was negative before incrementing it,
+ * and we need to wake up somebody.
+ *
+ * This routine adds one to the count of processes that need to wake up and
+ * exit.  ALL waiting processes actually wake up but only the one that gets
+ * to the "waking" field first will gate through and acquire the semaphore.
+ * The others will go back to sleep.
+ *
+ * Note that these functions are only called when there is contention on the
+ * lock, and as such all this is the "non-critical" part of the whole
+ * semaphore business. The critical part is the inline stuff in
+ * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
+ */
+
+/*
+ * Perform the "down" function.  Return zero for semaphore acquired,
+ * negative if the task was signalled out of the function.
+ *
+ * If called from down, the return is ignored and the wait loop is
+ * not interruptible.  This means that a task waiting on a semaphore
+ * using "down()" cannot be killed until someone does an "up()" on
+ * the semaphore.
+ *
+ * If called from down_interruptible, the return value gets checked
+ * upon return.  If the return value is negative then the task continues
+ * with the negative value in the return register (it can be tested by
+ * the caller).
+ *
+ * Either form may be used in conjunction with "up()".
+ */
+
+void
+__down_failed(struct semaphore *sem)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down failed(%p)\n",
+              current->comm, current->pid, sem);
+#endif
+
+       current->state = TASK_UNINTERRUPTIBLE;
+       wmb();
+       add_wait_queue_exclusive(&sem->wait, &wait);
+
+       /* At this point we know that sem->count is negative.  In order
+          to avoid racing with __up, we must check for wakeup before
+          going to sleep the first time.  */
+
+       while (1) {
+               long ret, tmp;
+
+               /* An atomic conditional decrement of sem->waking.  */
+               __asm__ __volatile__(
+                       "1:     ldl_l   %1,%2\n"
+                       "       blt     %1,2f\n"
+                       "       subl    %1,1,%0\n"
+                       "       stl_c   %0,%2\n"
+                       "       beq     %0,3f\n"
+                       "2:\n"
+                       ".subsection 2\n"
+                       "3:     br      1b\n"
+                       ".previous"
+                       : "=r"(ret), "=&r"(tmp), "=m"(sem->waking)
+                       : "0"(0));
+
+               if (ret)
+                       break;
+
+               schedule();
+               set_task_state(current, TASK_UNINTERRUPTIBLE);
+       }
+
+       remove_wait_queue(&sem->wait, &wait);
+       current->state = TASK_RUNNING;
+
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down acquired(%p)\n",
+              current->comm, current->pid, sem);
+#endif
+}
+
+int
+__down_failed_interruptible(struct semaphore *sem)
+{
+       DECLARE_WAITQUEUE(wait, current);
+       long ret;
+
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down failed(%p)\n",
+              current->comm, current->pid, sem);
+#endif
+
+       current->state = TASK_INTERRUPTIBLE;
+       wmb();
+       add_wait_queue_exclusive(&sem->wait, &wait);
+
+       while (1) {
+               long tmp, tmp2, tmp3;
+
+               /* We must undo the sem->count down_interruptible decrement
+                  simultaneously and atomically with the sem->waking
+                  adjustment, otherwise we can race with __up.  This is
+                  accomplished by doing a 64-bit ll/sc on two 32-bit words.
+               
+                  "Equivalent" C.  Note that we have to do this all without
+                  (taken) branches in order to be a valid ll/sc sequence.
+
+                  do {
+                      tmp = ldq_l;
+                      ret = 0;
+                      if (tmp >= 0) {                  // waking >= 0
+                          tmp += 0xffffffff00000000;   // waking -= 1
+                          ret = 1;
+                      }
+                      else if (pending) {
+                          // count += 1, but since -1 + 1 carries into the
+                          // high word, we have to be more careful here.
+                          tmp = (tmp & 0xffffffff00000000)
+                                | ((tmp + 1) & 0x00000000ffffffff);
+                          ret = -EINTR;
+                      }
+                      tmp = stq_c = tmp;
+                  } while (tmp == 0);
+               */
+
+               __asm__ __volatile__(
+                       "1:     ldq_l   %1,%4\n"
+                       "       lda     %0,0\n"
+                       "       cmovne  %5,%6,%0\n"
+                       "       addq    %1,1,%2\n"
+                       "       and     %1,%7,%3\n"
+                       "       andnot  %2,%7,%2\n"
+                       "       cmovge  %1,1,%0\n"
+                       "       or      %3,%2,%2\n"
+                       "       addq    %1,%7,%3\n"
+                       "       cmovne  %5,%2,%1\n"
+                       "       cmovge  %2,%3,%1\n"
+                       "       stq_c   %1,%4\n"
+                       "       beq     %1,3f\n"
+                       "2:\n"
+                       ".subsection 2\n"
+                       "3:     br      1b\n"
+                       ".previous"
+                       : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2),
+                         "=&r"(tmp3), "=m"(*sem)
+                       : "r"(signal_pending(current)), "r"(-EINTR),
+                         "r"(0xffffffff00000000));
+
+               /* At this point we have ret
+                       1       got the lock
+                       0       go to sleep
+                       -EINTR  interrupted  */
+               if (ret != 0)
+                       break;
+
+               schedule();
+               set_task_state(current, TASK_INTERRUPTIBLE);
+       }
+
+       remove_wait_queue(&sem->wait, &wait);
+       current->state = TASK_RUNNING;
+       wake_up(&sem->wait);
+
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down %s(%p)\n",
+              current->comm, current->pid,
+              (ret < 0 ? "interrupted" : "acquired"), sem);
+#endif
+
+       /* Convert "got the lock" to 0==success.  */
+       return (ret < 0 ? ret : 0);
+}
+
+void
+__up_wakeup(struct semaphore *sem)
+{
+       wake_up(&sem->wait);
+}
+
+void
+down(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down(%p) <count=%d> from %p\n",
+              current->comm, current->pid, sem,
+              atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+       __down(sem);
+}
+
+int
+down_interruptible(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down(%p) <count=%d> from %p\n",
+              current->comm, current->pid, sem,
+              atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+       return __down_interruptible(sem);
+}
+
+int
+down_trylock(struct semaphore *sem)
+{
+       int ret;
+
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+
+       ret = __down_trylock(sem);
+
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): down_trylock %s from %p\n",
+              current->comm, current->pid,
+              ret ? "failed" : "acquired",
+              __builtin_return_address(0));
+#endif
+
+       return ret;
+}
+
+void
+up(struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+#if DEBUG_SEMAPHORE
+       printk("%s(%d): up(%p) <count=%d> from %p\n",
+              current->comm, current->pid, sem,
+              atomic_read(&sem->count), __builtin_return_address(0));
+#endif
+       __up(sem);
+}
+
+
+/*
+ * RW Semaphores
+ */
+
+void
+__down_read_failed(struct rw_semaphore *sem, int count)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+ retry_down:
+       if (count < 0) {
+               /* Waiting on multiple readers and/or writers.  */
+               
+               /* Undo the acquisition we started in down_read.  */
+               atomic_inc(&sem->count);
+
+               current->state = TASK_UNINTERRUPTIBLE;
+               wmb();
+               add_wait_queue(&sem->wait, &wait);
+               mb();
+               while (atomic_read(&sem->count) < 0) {
+                       schedule();
+                       set_task_state(current, TASK_UNINTERRUPTIBLE);
+               }
+
+               remove_wait_queue(&sem->wait, &wait);
+               current->state = TASK_RUNNING;
+
+               mb();
+               count = atomic_dec_return(&sem->count);
+               if (count <= 0)
+                       goto retry_down;
+       } else {
+               /* Waiting on exactly one writer.  */
+
+               current->state = TASK_UNINTERRUPTIBLE;
+               wmb();
+               add_wait_queue(&sem->wait, &wait);
+               mb();
+
+               while (!test_and_clear_bit(0, &sem->granted)) {
+                       schedule();
+                       set_task_state(current, TASK_UNINTERRUPTIBLE);
+               }
+
+               remove_wait_queue(&sem->wait, &wait);
+               current->state = TASK_RUNNING;
+       }
+}
+
+void
+__down_write_failed(struct rw_semaphore *sem, int count)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+ retry_down:
+       if (count + RW_LOCK_BIAS < 0) {
+               /* Waiting on multiple readers and/or writers.  */
+
+               /* Undo the acquisition we started in down_write.  */
+               atomic_add(RW_LOCK_BIAS, &sem->count);
+
+               current->state = TASK_UNINTERRUPTIBLE;
+               wmb();
+               add_wait_queue_exclusive(&sem->wait, &wait);
+               mb();
+       
+               while (atomic_read(&sem->count) + RW_LOCK_BIAS < 0) {
+                       schedule();
+                       set_task_state(current, TASK_UNINTERRUPTIBLE);
+               }
+
+               remove_wait_queue(&sem->wait, &wait);
+               current->state = TASK_RUNNING;
+
+               count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
+               if (count != 0)
+                       goto retry_down;
+       } else {
+               /* Waiting on exactly one writer.  */
+
+               current->state = TASK_UNINTERRUPTIBLE;
+               wmb();
+               add_wait_queue_exclusive(&sem->wait, &wait);
+               mb();
+
+               while (!test_and_clear_bit(1, &sem->granted)) {
+                       schedule();
+                       set_task_state(current, TASK_UNINTERRUPTIBLE);
+               }
+
+               remove_wait_queue(&sem->write_bias_wait, &wait);
+               current->state = TASK_RUNNING;
+
+               /* If the lock is currently unbiased, awaken the sleepers.
+                  FIXME: This wakes up the readers early in a bit of a
+                  stampede -> bad!  */
+               count = atomic_read(&sem->count);
+               if (__builtin_expect(count >= 0, 0))
+                       wake_up(&sem->wait);
+       }
+}
+
+void
+__rwsem_wake(struct rw_semaphore *sem, int readers)
+{
+       if (readers) {
+               if (test_and_set_bit(0, &sem->granted))
+                       BUG();
+               wake_up(&sem->wait);
+       } else {
+               if (test_and_set_bit(1, &sem->granted))
+                       BUG();
+               wake_up(&sem->write_bias_wait);
+       }
+}
+
+void
+down_read(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+       __down_read(sem);
+#if WAITQUEUE_DEBUG
+       if (sem->granted & 2)
+               BUG();
+       if (atomic_read(&sem->writers))
+               BUG();
+       atomic_inc(&sem->readers);
+#endif
+}
+
+void
+down_write(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+#endif
+       __down_write(sem);
+#if WAITQUEUE_DEBUG
+       if (sem->granted & 3)
+               BUG();
+       if (atomic_read(&sem->writers))
+               BUG();
+       if (atomic_read(&sem->readers))
+               BUG();
+       atomic_inc(&sem->writers);
+#endif
+}
+
+void
+up_read(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+       if (sem->granted & 2)
+               BUG();
+       if (atomic_read(&sem->writers))
+               BUG();
+       atomic_dec(&sem->readers);
+#endif
+       __up_read(sem);
+}
+
+void
+up_write(struct rw_semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+       CHECK_MAGIC(sem->__magic);
+       if (sem->granted & 3)
+               BUG();
+       if (atomic_read(&sem->readers))
+               BUG();
+       if (atomic_read(&sem->writers) != 1)
+               BUG();
+       atomic_dec(&sem->writers);
+#endif
+       __up_write(sem);
+}
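
As a rough illustration of the count/waking protocol described in the
comment block at the top of the new semaphore.c, here is a minimal
user-space sketch.  It is not the kernel code: C11 atomics stand in for
the Alpha ldl_l/stl_c sequences, a pthread condition variable stands in
for the kernel wait queue, and all names (sketch_sem, sketch_down, ...)
are hypothetical.

    #include <pthread.h>
    #include <stdatomic.h>

    struct sketch_sem {
            atomic_int count;     /* >= 0: free slots; < 0: sleepers */
            atomic_int waking;    /* wakeups granted, not yet consumed */
            pthread_mutex_t lock; /* protects the condvar handoff */
            pthread_cond_t wait;  /* stands in for sem->wait */
    };

    #define SKETCH_SEM_INIT(n) \
            { (n), 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

    /* Atomic conditional decrement of ->waking: succeed only if it is
       positive.  This plays the role of the ldl_l/stl_c loop in
       __down_failed. */
    static int waking_non_zero(struct sketch_sem *sem)
    {
            int w = atomic_load(&sem->waking);
            while (w > 0)
                    if (atomic_compare_exchange_weak(&sem->waking, &w, w - 1))
                            return 1;    /* we gated through */
            return 0;
    }

    void sketch_down(struct sketch_sem *sem)
    {
            /* Fast path: count was positive, no contention. */
            if (atomic_fetch_sub(&sem->count, 1) > 0)
                    return;

            /* Slow path: sleep until an up() grants a wakeup. */
            pthread_mutex_lock(&sem->lock);
            while (!waking_non_zero(sem))
                    pthread_cond_wait(&sem->wait, &sem->lock);
            pthread_mutex_unlock(&sem->lock);
    }

    void sketch_up(struct sketch_sem *sem)
    {
            /* up() has extra work only if count was negative, i.e.
               there is at least one sleeper to be woken. */
            if (atomic_fetch_add(&sem->count, 1) < 0) {
                    pthread_mutex_lock(&sem->lock);
                    atomic_fetch_add(&sem->waking, 1);
                    pthread_cond_broadcast(&sem->wait); /* ALL waiters wake */
                    pthread_mutex_unlock(&sem->lock);
            }
    }

The essential property matches the kernel comment above: up() wakes all
waiters, but only the one that wins the atomic decrement of ->waking
gates through and acquires the semaphore; the others go back to sleep.
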
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 0edf608..bc7beb7 100644
@@ -378,6 +378,9 @@ do_settimeofday(struct timeval *tv)
  * BUG: This routine does not handle hour overflow properly; it just
  *      sets the minutes. Usually you won't notice until after reboot!
  */
+
+extern int abs(int);
+
 static int
 set_rtc_mmss(unsigned long nowtime)
 {
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
index d22a6f5..913331a 100644
@@ -12,7 +12,7 @@ OBJS  = __divqu.o __remqu.o __divlu.o __remlu.o memset.o memcpy.o io.o \
        strcat.o strcpy.o strncat.o strncpy.o stxcpy.o stxncpy.o \
        strchr.o strrchr.o memchr.o \
        copy_user.o clear_user.o strncpy_from_user.o strlen_user.o \
-       csum_ipv6_magic.o strcasecmp.o semaphore.o fpreg.o \
+       csum_ipv6_magic.o strcasecmp.o fpreg.o \
        callback_srm.o srm_puts.o srm_printk.o
 
 lib.a: $(OBJS)
diff --git a/arch/alpha/lib/semaphore.S b/arch/alpha/lib/semaphore.S
deleted file mode 100644 (file)
index 517285e..0000000
--- a/arch/alpha/lib/semaphore.S
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- *  linux/arch/alpha/lib/semaphore.S
- *
- *  Copyright (C) 1999, 2000  Richard Henderson
- */
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- */
-
-       .set noat
-       .set noreorder
-       .align 4
-
-/* __down_failed takes the semaphore in $24, clobbers $24 and $28.  */
-
-       .globl  __down_failed
-       .ent    __down_failed
-__down_failed:
-       ldgp    $29,0($27)
-       lda     $30, -20*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $25, 17*8($30)
-       stq     $26, 18*8($30)
-       .frame $30, 20*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       jsr     __down
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $25, 17*8($30)
-       ldq     $26, 18*8($30)
-       lda     $30, 20*8($30)
-       ret     $31, ($28), 0
-       .end    __down_failed
-
-/* __down_failed_interruptible takes the semaphore in $24,
-   clobbers $28, returns success in $24.  */
-
-       .globl  __down_failed_interruptible
-       .ent    __down_failed_interruptible
-__down_failed_interruptible:
-       ldgp    $29,0($27)
-       lda     $30, -20*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $25, 17*8($30)
-       stq     $26, 18*8($30)
-       .frame $30, 20*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       jsr     __down_interruptible
-       mov     $0, $24
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $25, 17*8($30)
-       ldq     $26, 18*8($30)
-       lda     $30, 20*8($30)
-       ret     $31, ($28), 0
-       .end    __down_failed_interruptible
-
-/* __up_wakeup takes the semaphore in $24, clobbers $24 and $28.  */
-
-       .globl  __up_wakeup
-       .ent    __up_wakeup
-__up_wakeup:
-       ldgp    $29,0($27)
-       lda     $30, -20*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $25, 17*8($30)
-       stq     $26, 18*8($30)
-       .frame $30, 20*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       jsr     __up
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $25, 17*8($30)
-       ldq     $26, 18*8($30)
-       lda     $30, 20*8($30)
-       ret     $31, ($28), 0
-       .end    __up_wakeup
-
-/* __down_read_failed takes the semaphore in $24, count in $25;
-   clobbers $24, $25 and $28.  */
-
-       .globl  __down_read_failed
-       .ent    __down_read_failed
-__down_read_failed:
-       ldgp    $29,0($27)
-       lda     $30, -18*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $26, 17*8($30)
-       .frame $30, 18*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       mov     $25, $17
-       jsr     __down_read
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $26, 17*8($30)
-       lda     $30, 18*8($30)
-       ret     $31, ($28), 0
-       .end    __down_read_failed
-
-/* __down_write_failed takes the semaphore in $24, count in $25;
-   clobbers $24, $25 and $28.  */
-
-       .globl  __down_write_failed
-       .ent    __down_write_failed
-__down_write_failed:
-       ldgp    $29,0($27)
-       lda     $30, -20*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $26, 17*8($30)
-       .frame $30, 18*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       mov     $25, $17
-       jsr     __down_write
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $26, 17*8($30)
-       lda     $30, 18*8($30)
-       ret     $31, ($28), 0
-       .end    __down_write_failed
-
-/* __rwsem_wake takes the semaphore in $24, readers in $25;
-   clobbers $24, $25, and $28.  */
-
-       .globl  __rwsem_wake
-       .ent    __rwsem_wake
-__rwsem_wake:
-       ldgp    $29,0($27)
-       lda     $30, -18*8($30)
-       stq     $28, 0*8($30)
-       stq     $0, 1*8($30)
-       stq     $1, 2*8($30)
-       stq     $2, 3*8($30)
-       stq     $3, 4*8($30)
-       stq     $4, 5*8($30)
-       stq     $5, 6*8($30)
-       stq     $6, 7*8($30)
-       stq     $7, 8*8($30)
-       stq     $16, 9*8($30)
-       stq     $17, 10*8($30)
-       stq     $18, 11*8($30)
-       stq     $19, 12*8($30)
-       stq     $20, 13*8($30)
-       stq     $21, 14*8($30)
-       stq     $22, 15*8($30)
-       stq     $23, 16*8($30)
-       stq     $26, 17*8($30)
-       .frame $30, 18*8, $28
-       .prologue 1
-       
-       mov     $24, $16
-       mov     $25, $17
-       jsr     __do_rwsem_wake
-       
-       ldq     $28, 0*8($30)
-       ldq     $0, 1*8($30)
-       ldq     $1, 2*8($30)
-       ldq     $2, 3*8($30)
-       ldq     $3, 4*8($30)
-       ldq     $4, 5*8($30)
-       ldq     $5, 6*8($30)
-       ldq     $6, 7*8($30)
-       ldq     $7, 8*8($30)
-       ldq     $16, 9*8($30)
-       ldq     $17, 10*8($30)
-       ldq     $18, 11*8($30)
-       ldq     $19, 12*8($30)
-       ldq     $20, 13*8($30)
-       ldq     $21, 14*8($30)
-       ldq     $22, 15*8($30)
-       ldq     $23, 16*8($30)
-       ldq     $26, 17*8($30)
-       lda     $30, 18*8($30)
-       ret     $31, ($28), 0
-       .end    __rwsem_wake
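
The rw-semaphore entry points above (__down_read_failed,
__down_write_failed, __rwsem_wake) are only the slow paths; the fast
paths live in <asm-alpha/semaphore.h> and rely on a single counter
biased by RW_LOCK_BIAS.  Below is a minimal sketch of that counter
convention, assuming the 0x01000000 bias value the 2.4 i386 headers
use, with hypothetical names and the contended cases reduced to
comments; the exact wakeup conditions are approximate.

    #include <stdatomic.h>

    #define RW_LOCK_BIAS 0x01000000   /* assumed 2.4-style bias value */

    struct rwsem_sketch {
            atomic_int count;         /* starts at RW_LOCK_BIAS (unlocked) */
    };

    /* Reader: take one unit.  While a writer holds the lock the whole
       bias has been subtracted, so the result goes negative. */
    void sketch_down_read(struct rwsem_sketch *sem)
    {
            if (atomic_fetch_sub(&sem->count, 1) - 1 < 0) {
                    /* contended: the kernel enters __down_read_failed()
                       and sleeps on sem->wait. */
            }
    }

    /* Writer: take the whole bias.  The result is zero only if there
       were no readers and no other writer. */
    void sketch_down_write(struct rwsem_sketch *sem)
    {
            if (atomic_fetch_sub(&sem->count, RW_LOCK_BIAS)
                - RW_LOCK_BIAS != 0) {
                    /* contended: __down_write_failed(). */
            }
    }

    void sketch_up_read(struct rwsem_sketch *sem)
    {
            if (atomic_fetch_add(&sem->count, 1) + 1 == 0) {
                    /* last reader with a writer queued: __rwsem_wake(). */
            }
    }

    void sketch_up_write(struct rwsem_sketch *sem)
    {
            if (atomic_fetch_add(&sem->count, RW_LOCK_BIAS)
                + RW_LOCK_BIAS != RW_LOCK_BIAS) {
                    /* waiters queued while we held it: wake them. */
            }
    }
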
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cff836d..ff67aad 100644
@@ -2344,18 +2344,7 @@ static mdk_personality_t raid5_personality=
 
 int raid5_init (void)
 {
-       int err;
-
-       err = register_md_personality (RAID5, &raid5_personality);
-       if (err)
-               return err;
-
-       /*
-        * pick a XOR routine, runtime.
-        */
-       calibrate_xor_block();
-
-       return 0;
+       return register_md_personality (RAID5, &raid5_personality);
 }
 
 #ifdef MODULE
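
The raid5_init() hunk above drops the calibrate_xor_block() call; the
calibration itself is the "XOR function template" scheme that the old
xor.c comment below describes: register several candidate routines,
time each one, and point xor_block at the fastest.  Here is a
user-space sketch of that idea, with hypothetical names, clock() in
place of the kernel's jiffies-based timing loop, and a simplified
xor_block signature.

    #include <stddef.h>
    #include <stdio.h>
    #include <time.h>

    typedef void (*xor_block_t)(unsigned long *dst, unsigned long *src,
                                size_t words);

    static void xor_8regs(unsigned long *dst, unsigned long *src,
                          size_t words)
    {
            for (size_t i = 0; i < words; i++)
                    dst[i] ^= src[i];
    }

    static void xor_32regs(unsigned long *dst, unsigned long *src,
                           size_t words)
    {
            /* A second candidate; real templates differ in unrolling,
               prefetching, or instruction set (MMX, VIS, ...).  This
               sketch assumes words is a multiple of 4. */
            for (size_t i = 0; i + 4 <= words; i += 4) {
                    dst[i]     ^= src[i];
                    dst[i + 1] ^= src[i + 1];
                    dst[i + 2] ^= src[i + 2];
                    dst[i + 3] ^= src[i + 3];
            }
    }

    struct xor_template {
            const char *name;
            xor_block_t fn;
    };

    xor_block_t xor_block;    /* chosen once, like the kernel pointer */

    void calibrate_xor_block(void)
    {
            static struct xor_template candidates[] = {
                    { "8regs",  xor_8regs  },
                    { "32regs", xor_32regs },
            };
            static unsigned long a[4096], b[4096];
            double best = -1.0;

            for (size_t i = 0;
                 i < sizeof(candidates) / sizeof(candidates[0]); i++) {
                    clock_t t0 = clock();
                    for (int rep = 0; rep < 1000; rep++)
                            candidates[i].fn(a, b, 4096);
                    double elapsed = (double)(clock() - t0) / CLOCKS_PER_SEC;
                    if (best < 0 || elapsed < best) {
                            best = elapsed;
                            xor_block = candidates[i].fn;
                            printf("xor: using %s\n", candidates[i].name);
                    }
            }
    }

Compile-time selection would not be enough here, for the reason the old
comment gives: features like MMX can only be detected at runtime.
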
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
dissimilarity index 98%
index 4fe04fb..f58463e 100644
-/*
- * xor.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek
- *
- *
- * optimized RAID-5 checksumming functions.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/config.h>
-#define BH_TRACE 0
-#include <linux/module.h>
-#include <linux/raid/md.h>
-#ifdef __sparc_v9__
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/visasm.h>
-#endif
-
-/*
- * we use the 'XOR function template' to register multiple xor
- * functions runtime. The kernel measures their speed upon bootup
- * and decides which one to use. (compile-time registration is
- * not enough as certain CPU features like MMX can only be detected
- * runtime)
- *
- * this architecture makes it pretty easy to add new routines
- * that are faster on certain CPUs, without killing other CPU's
- * 'native' routine. Although the current routines are belived
- * to be the physically fastest ones on all CPUs tested, but
- * feel free to prove me wrong and add yet another routine =B-)
- * --mingo
- */
-
-#define MAX_XOR_BLOCKS 5
-
-#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr)
-
-typedef void (*xor_block_t) XOR_ARGS;
-xor_block_t xor_block = NULL;
-
-#ifndef __sparc_v9__
-
-struct xor_block_template;
-
-struct xor_block_template {
-       char * name;
-       xor_block_t xor_block;
-       int speed;
-       struct xor_block_template * next;
-};
-
-struct xor_block_template * xor_functions = NULL;
-
-#define XORBLOCK_TEMPLATE(x) \
-static void xor_block_##x XOR_ARGS; \
-static struct xor_block_template t_xor_block_##x = \
-                                { #x, xor_block_##x, 0, NULL }; \
-static void xor_block_##x XOR_ARGS
-
-#ifdef __i386__
-
-#ifdef CONFIG_X86_XMM
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-XORBLOCK_TEMPLATE(pIII_kni)
-{
-       char xmm_save[16*4];
-       int cr0;
-        int lines = (bh_ptr[0]->b_size>>8);
-
-       __asm__ __volatile__ ( 
-               "movl %%cr0,%0          ;\n\t"
-               "clts                   ;\n\t"
-               "movups %%xmm0,(%1)     ;\n\t"
-               "movups %%xmm1,0x10(%1) ;\n\t"
-               "movups %%xmm2,0x20(%1) ;\n\t"
-               "movups %%xmm3,0x30(%1) ;\n\t"
-               : "=r" (cr0)
-               : "r" (xmm_save) 
-               : "memory" );
-
-#define OFFS(x) "8*("#x"*2)"
-#define        PF0(x) \
-       "       prefetcht0  "OFFS(x)"(%1)   ;\n"
-#define LD(x,y) \
-        "       movaps   "OFFS(x)"(%1), %%xmm"#y"   ;\n"
-#define ST(x,y) \
-        "       movaps %%xmm"#y",   "OFFS(x)"(%1)   ;\n"
-#define PF1(x) \
-       "       prefetchnta "OFFS(x)"(%2)   ;\n"
-#define PF2(x) \
-       "       prefetchnta "OFFS(x)"(%3)   ;\n"
-#define PF3(x) \
-       "       prefetchnta "OFFS(x)"(%4)   ;\n"
-#define PF4(x) \
-       "       prefetchnta "OFFS(x)"(%5)   ;\n"
-#define PF5(x) \
-       "       prefetchnta "OFFS(x)"(%6)   ;\n"
-#define XO1(x,y) \
-        "       xorps   "OFFS(x)"(%2), %%xmm"#y"   ;\n"
-#define XO2(x,y) \
-        "       xorps   "OFFS(x)"(%3), %%xmm"#y"   ;\n"
-#define XO3(x,y) \
-        "       xorps   "OFFS(x)"(%4), %%xmm"#y"   ;\n"
-#define XO4(x,y) \
-        "       xorps   "OFFS(x)"(%5), %%xmm"#y"   ;\n"
-#define XO5(x,y) \
-        "       xorps   "OFFS(x)"(%6), %%xmm"#y"   ;\n"
-
-       switch(count) {
-               case 2:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-               LD(i,0)                                 \
-                       LD(i+1,1)                       \
-               PF1(i)                                  \
-                               PF1(i+2)                \
-                               LD(i+2,2)               \
-                                       LD(i+3,3)       \
-               PF0(i+4)                                \
-                               PF0(i+6)                \
-               XO1(i,0)                                \
-                       XO1(i+1,1)                      \
-                               XO1(i+2,2)              \
-                                       XO1(i+3,3)      \
-               ST(i,0)                                 \
-                       ST(i+1,1)                       \
-                               ST(i+2,2)               \
-                                       ST(i+3,3)       \
-
-
-               PF0(0)
-                               PF0(2)
-
-       " .align 32,0x90                ;\n"
-        " 1:                            ;\n"
-
-               BLOCK(0)
-               BLOCK(4)
-               BLOCK(8)
-               BLOCK(12)
-
-        "       addl $256, %1           ;\n"
-        "       addl $256, %2           ;\n"
-        "       decl %0                 ;\n"
-        "       jnz 1b                  ;\n"
-
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data)
-                       : "memory" );
-                       break;
-               case 3:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-               PF1(i)                                  \
-                               PF1(i+2)                \
-               LD(i,0)                                 \
-                       LD(i+1,1)                       \
-                               LD(i+2,2)               \
-                                       LD(i+3,3)       \
-               PF2(i)                                  \
-                               PF2(i+2)                \
-               PF0(i+4)                                \
-                               PF0(i+6)                \
-               XO1(i,0)                                \
-                       XO1(i+1,1)                      \
-                               XO1(i+2,2)              \
-                                       XO1(i+3,3)      \
-               XO2(i,0)                                \
-                       XO2(i+1,1)                      \
-                               XO2(i+2,2)              \
-                                       XO2(i+3,3)      \
-               ST(i,0)                                 \
-                       ST(i+1,1)                       \
-                               ST(i+2,2)               \
-                                       ST(i+3,3)       \
-
-
-               PF0(0)
-                               PF0(2)
-
-       " .align 32,0x90                ;\n"
-        " 1:                            ;\n"
-
-               BLOCK(0)
-               BLOCK(4)
-               BLOCK(8)
-               BLOCK(12)
-
-        "       addl $256, %1           ;\n"
-        "       addl $256, %2           ;\n"
-        "       addl $256, %3           ;\n"
-        "       decl %0                 ;\n"
-        "       jnz 1b                  ;\n"
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data)
-                       : "memory" );
-                       break;
-               case 4:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-               PF1(i)                                  \
-                               PF1(i+2)                \
-               LD(i,0)                                 \
-                       LD(i+1,1)                       \
-                               LD(i+2,2)               \
-                                       LD(i+3,3)       \
-               PF2(i)                                  \
-                               PF2(i+2)                \
-               XO1(i,0)                                \
-                       XO1(i+1,1)                      \
-                               XO1(i+2,2)              \
-                                       XO1(i+3,3)      \
-               PF3(i)                                  \
-                               PF3(i+2)                \
-               PF0(i+4)                                \
-                               PF0(i+6)                \
-               XO2(i,0)                                \
-                       XO2(i+1,1)                      \
-                               XO2(i+2,2)              \
-                                       XO2(i+3,3)      \
-               XO3(i,0)                                \
-                       XO3(i+1,1)                      \
-                               XO3(i+2,2)              \
-                                       XO3(i+3,3)      \
-               ST(i,0)                                 \
-                       ST(i+1,1)                       \
-                               ST(i+2,2)               \
-                                       ST(i+3,3)       \
-
-
-               PF0(0)
-                               PF0(2)
-
-       " .align 32,0x90                ;\n"
-        " 1:                            ;\n"
-
-               BLOCK(0)
-               BLOCK(4)
-               BLOCK(8)
-               BLOCK(12)
-
-        "       addl $256, %1           ;\n"
-        "       addl $256, %2           ;\n"
-        "       addl $256, %3           ;\n"
-        "       addl $256, %4           ;\n"
-        "       decl %0                 ;\n"
-        "       jnz 1b                  ;\n"
-
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data),
-                         "r" (bh_ptr[3]->b_data)
-                       : "memory" );
-                       break;
-               case 5:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-               PF1(i)                                  \
-                               PF1(i+2)                \
-               LD(i,0)                                 \
-                       LD(i+1,1)                       \
-                               LD(i+2,2)               \
-                                       LD(i+3,3)       \
-               PF2(i)                                  \
-                               PF2(i+2)                \
-               XO1(i,0)                                \
-                       XO1(i+1,1)                      \
-                               XO1(i+2,2)              \
-                                       XO1(i+3,3)      \
-               PF3(i)                                  \
-                               PF3(i+2)                \
-               XO2(i,0)                                \
-                       XO2(i+1,1)                      \
-                               XO2(i+2,2)              \
-                                       XO2(i+3,3)      \
-               PF4(i)                                  \
-                               PF4(i+2)                \
-               PF0(i+4)                                \
-                               PF0(i+6)                \
-               XO3(i,0)                                \
-                       XO3(i+1,1)                      \
-                               XO3(i+2,2)              \
-                                       XO3(i+3,3)      \
-               XO4(i,0)                                \
-                       XO4(i+1,1)                      \
-                               XO4(i+2,2)              \
-                                       XO4(i+3,3)      \
-               ST(i,0)                                 \
-                       ST(i+1,1)                       \
-                               ST(i+2,2)               \
-                                       ST(i+3,3)       \
-
-
-               PF0(0)
-                               PF0(2)
-
-       " .align 32,0x90                ;\n"
-        " 1:                            ;\n"
-
-               BLOCK(0)
-               BLOCK(4)
-               BLOCK(8)
-               BLOCK(12)
-
-        "       addl $256, %1           ;\n"
-        "       addl $256, %2           ;\n"
-        "       addl $256, %3           ;\n"
-        "       addl $256, %4           ;\n"
-        "       addl $256, %5           ;\n"
-        "       decl %0                 ;\n"
-        "       jnz 1b                  ;\n"
-
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data),
-                         "r" (bh_ptr[3]->b_data),
-                         "r" (bh_ptr[4]->b_data)
-                       : "memory");
-                       break;
-       }
-
-       __asm__ __volatile__ ( 
-               "sfence                 ;\n\t"
-               "movups (%1),%%xmm0     ;\n\t"
-               "movups 0x10(%1),%%xmm1 ;\n\t"
-               "movups 0x20(%1),%%xmm2 ;\n\t"
-               "movups 0x30(%1),%%xmm3 ;\n\t"
-               "movl   %0,%%cr0        ;\n\t"
-               :
-               : "r" (cr0), "r" (xmm_save)
-               : "memory" );
-}
-
-#undef OFFS
-#undef LD
-#undef ST
-#undef PF0
-#undef PF1
-#undef PF2
-#undef PF3
-#undef PF4
-#undef PF5
-#undef XO1
-#undef XO2
-#undef XO3
-#undef XO4
-#undef XO5
-#undef BLOCK
-
-#endif /* CONFIG_X86_XMM */
-
-/*
- * high-speed RAID5 checksumming functions utilizing MMX instructions
- * Copyright (C) 1998 Ingo Molnar
- */
-XORBLOCK_TEMPLATE(pII_mmx)
-{
-       char fpu_save[108];
-        int lines = (bh_ptr[0]->b_size>>7);
-
-       if (!(current->flags & PF_USEDFPU))
-               __asm__ __volatile__ ( " clts;\n");
-
-       __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
-
-#define LD(x,y) \
-        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
-#define ST(x,y) \
-        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
-#define XO1(x,y) \
-        "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
-#define XO2(x,y) \
-        "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
-#define XO3(x,y) \
-        "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
-#define XO4(x,y) \
-        "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
-
-       switch(count) {
-               case 2:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-                       LD(i,0)                                 \
-                               LD(i+1,1)                       \
-                                       LD(i+2,2)               \
-                                               LD(i+3,3)       \
-                       XO1(i,0)                                \
-                       ST(i,0)                                 \
-                               XO1(i+1,1)                      \
-                               ST(i+1,1)                       \
-                                       XO1(i+2,2)              \
-                                       ST(i+2,2)               \
-                                               XO1(i+3,3)      \
-                                               ST(i+3,3)
-
-                       " .align 32,0x90                ;\n"
-                       " 1:                            ;\n"
-
-                       BLOCK(0)
-                       BLOCK(4)
-                       BLOCK(8)
-                       BLOCK(12)
-
-                       "       addl $128, %1         ;\n"
-                       "       addl $128, %2         ;\n"
-                       "       decl %0               ;\n"
-                       "       jnz 1b                ;\n"
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data)
-                       : "memory");
-                       break;
-               case 3:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-                       LD(i,0)                                 \
-                               LD(i+1,1)                       \
-                                       LD(i+2,2)               \
-                                               LD(i+3,3)       \
-                       XO1(i,0)                                \
-                               XO1(i+1,1)                      \
-                                       XO1(i+2,2)              \
-                                               XO1(i+3,3)      \
-                       XO2(i,0)                                \
-                       ST(i,0)                                 \
-                               XO2(i+1,1)                      \
-                               ST(i+1,1)                       \
-                                       XO2(i+2,2)              \
-                                       ST(i+2,2)               \
-                                               XO2(i+3,3)      \
-                                               ST(i+3,3)
-
-                       " .align 32,0x90                ;\n"
-                       " 1:                            ;\n"
-
-                       BLOCK(0)
-                       BLOCK(4)
-                       BLOCK(8)
-                       BLOCK(12)
-
-                       "       addl $128, %1         ;\n"
-                       "       addl $128, %2         ;\n"
-                       "       addl $128, %3         ;\n"
-                       "       decl %0               ;\n"
-                       "       jnz 1b                ;\n"
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data)
-                       : "memory");
-                       break;
-               case 4:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-                       LD(i,0)                                 \
-                               LD(i+1,1)                       \
-                                       LD(i+2,2)               \
-                                               LD(i+3,3)       \
-                       XO1(i,0)                                \
-                               XO1(i+1,1)                      \
-                                       XO1(i+2,2)              \
-                                               XO1(i+3,3)      \
-                       XO2(i,0)                                \
-                               XO2(i+1,1)                      \
-                                       XO2(i+2,2)              \
-                                               XO2(i+3,3)      \
-                       XO3(i,0)                                \
-                       ST(i,0)                                 \
-                               XO3(i+1,1)                      \
-                               ST(i+1,1)                       \
-                                       XO3(i+2,2)              \
-                                       ST(i+2,2)               \
-                                               XO3(i+3,3)      \
-                                               ST(i+3,3)
-
-                       " .align 32,0x90                ;\n"
-                       " 1:                            ;\n"
-
-                       BLOCK(0)
-                       BLOCK(4)
-                       BLOCK(8)
-                       BLOCK(12)
-
-                       "       addl $128, %1         ;\n"
-                       "       addl $128, %2         ;\n"
-                       "       addl $128, %3         ;\n"
-                       "       addl $128, %4         ;\n"
-                       "       decl %0               ;\n"
-                       "       jnz 1b                ;\n"
-                       :
-                       : "r" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data),
-                         "r" (bh_ptr[3]->b_data)
-                       : "memory");
-                       break;
-               case 5:
-                       __asm__ __volatile__ (
-#undef BLOCK
-#define BLOCK(i) \
-                       LD(i,0)                                 \
-                               LD(i+1,1)                       \
-                                       LD(i+2,2)               \
-                                               LD(i+3,3)       \
-                       XO1(i,0)                                \
-                               XO1(i+1,1)                      \
-                                       XO1(i+2,2)              \
-                                               XO1(i+3,3)      \
-                       XO2(i,0)                                \
-                               XO2(i+1,1)                      \
-                                       XO2(i+2,2)              \
-                                               XO2(i+3,3)      \
-                       XO3(i,0)                                \
-                               XO3(i+1,1)                      \
-                                       XO3(i+2,2)              \
-                                               XO3(i+3,3)      \
-                       XO4(i,0)                                \
-                       ST(i,0)                                 \
-                               XO4(i+1,1)                      \
-                               ST(i+1,1)                       \
-                                       XO4(i+2,2)              \
-                                       ST(i+2,2)               \
-                                               XO4(i+3,3)      \
-                                               ST(i+3,3)
-
-                       " .align 32,0x90                ;\n"
-                       " 1:                            ;\n"
-
-                       BLOCK(0)
-                       BLOCK(4)
-                       BLOCK(8)
-                       BLOCK(12)
-
-                       "       addl $128, %1         ;\n"
-                       "       addl $128, %2         ;\n"
-                       "       addl $128, %3         ;\n"
-                       "       addl $128, %4         ;\n"
-                       "       addl $128, %5         ;\n"
-                       "       decl %0               ;\n"
-                       "       jnz 1b                ;\n"
-                       :
-                       : "g" (lines),
-                         "r" (bh_ptr[0]->b_data),
-                         "r" (bh_ptr[1]->b_data),
-                         "r" (bh_ptr[2]->b_data),
-                         "r" (bh_ptr[3]->b_data),
-                         "r" (bh_ptr[4]->b_data)
-                       : "memory");
-                       break;
-       }
-
-       __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
-       if (!(current->flags & PF_USEDFPU))
-               stts();
-}
-
-#undef LD
-#undef XO1
-#undef XO2
-#undef XO3
-#undef XO4
-#undef ST
-#undef BLOCK
-
-XORBLOCK_TEMPLATE(p5_mmx)
-{
-       char fpu_save[108];
-        int lines = (bh_ptr[0]->b_size>>6);
-
-       if (!(current->flags & PF_USEDFPU))
-               __asm__ __volatile__ ( " clts;\n");
-
-       __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
-
-       switch(count) {
-               case 2:
-                       __asm__ __volatile__ (
-
-                               " .align 32,0x90             ;\n"
-                               " 1:                         ;\n"
-                               "       movq   (%1), %%mm0   ;\n"
-                               "       movq  8(%1), %%mm1   ;\n"
-                               "       pxor   (%2), %%mm0   ;\n"
-                               "       movq 16(%1), %%mm2   ;\n"
-                               "       movq %%mm0,   (%1)   ;\n"
-                               "       pxor  8(%2), %%mm1   ;\n"
-                               "       movq 24(%1), %%mm3   ;\n"
-                               "       movq %%mm1,  8(%1)   ;\n"
-                               "       pxor 16(%2), %%mm2   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
-                               "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 24(%2), %%mm3   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
-                               "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
-        
-                               "       addl $64, %1         ;\n"
-                               "       addl $64, %2         ;\n"
-                               "       decl %0              ;\n"
-                               "       jnz 1b               ;\n"
-
-                               : 
-                               : "r" (lines),
-                                 "r" (bh_ptr[0]->b_data),
-                                 "r" (bh_ptr[1]->b_data)
-                               : "memory" );
-                       break;
-               case 3:
-                       __asm__ __volatile__ (
-
-                               " .align 32,0x90             ;\n"
-                               " 1:                         ;\n"
-                               "       movq   (%1), %%mm0   ;\n"
-                               "       movq  8(%1), %%mm1   ;\n"
-                               "       pxor   (%2), %%mm0   ;\n"
-                               "       movq 16(%1), %%mm2   ;\n"
-                               "       pxor  8(%2), %%mm1   ;\n"
-                               "       pxor   (%3), %%mm0   ;\n"
-                               "       pxor 16(%2), %%mm2   ;\n"
-                               "       movq %%mm0,   (%1)   ;\n"
-                               "       pxor  8(%3), %%mm1   ;\n"
-                               "       pxor 16(%3), %%mm2   ;\n"
-                               "       movq 24(%1), %%mm3   ;\n"
-                               "       movq %%mm1,  8(%1)   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
-                               "       pxor 24(%2), %%mm3   ;\n"
-                               "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       pxor 24(%3), %%mm3   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
-        
-                               "       addl $64, %1         ;\n"
-                               "       addl $64, %2         ;\n"
-                               "       addl $64, %3         ;\n"
-                               "       decl %0              ;\n"
-                               "       jnz 1b               ;\n"
-
-                               : 
-                               : "r" (lines),
-                                 "r" (bh_ptr[0]->b_data),
-                                 "r" (bh_ptr[1]->b_data),
-                                 "r" (bh_ptr[2]->b_data)
-                               : "memory" );
-                       break;
-               case 4:
-                       __asm__ __volatile__ (
-
-                               " .align 32,0x90             ;\n"
-                               " 1:                         ;\n"
-                               "       movq   (%1), %%mm0   ;\n"
-                               "       movq  8(%1), %%mm1   ;\n"
-                               "       pxor   (%2), %%mm0   ;\n"
-                               "       movq 16(%1), %%mm2   ;\n"
-                               "       pxor  8(%2), %%mm1   ;\n"
-                               "       pxor   (%3), %%mm0   ;\n"
-                               "       pxor 16(%2), %%mm2   ;\n"
-                               "       pxor  8(%3), %%mm1   ;\n"
-                               "       pxor   (%4), %%mm0   ;\n"
-                               "       movq 24(%1), %%mm3   ;\n"
-                               "       pxor 16(%3), %%mm2   ;\n"
-                               "       pxor  8(%4), %%mm1   ;\n"
-                               "       movq %%mm0,   (%1)   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
-                               "       pxor 24(%2), %%mm3   ;\n"
-                               "       pxor 16(%4), %%mm2   ;\n"
-                               "       movq %%mm1,  8(%1)   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       pxor 24(%3), %%mm3   ;\n"
-                               "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
-                               "       pxor 24(%4), %%mm3   ;\n"
-                               "       movq %%mm3, 24(%1)   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       pxor 32(%4), %%mm4   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       pxor 40(%4), %%mm5   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%4), %%mm6   ;\n"
-                               "       pxor 56(%4), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
-        
-                               "       addl $64, %1         ;\n"
-                               "       addl $64, %2         ;\n"
-                               "       addl $64, %3         ;\n"
-                               "       addl $64, %4         ;\n"
-                               "       decl %0              ;\n"
-                               "       jnz 1b               ;\n"
-
-                               : 
-                               : "r" (lines),
-                                 "r" (bh_ptr[0]->b_data),
-                                 "r" (bh_ptr[1]->b_data),
-                                 "r" (bh_ptr[2]->b_data),
-                                 "r" (bh_ptr[3]->b_data)
-                               : "memory" );
-                       break;
-               case 5:
-                       __asm__ __volatile__ (
-
-                               " .align 32,0x90             ;\n"
-                               " 1:                         ;\n"
-                               "       movq   (%1), %%mm0   ;\n"
-                               "       movq  8(%1), %%mm1   ;\n"
-                               "       pxor   (%2), %%mm0   ;\n"
-                               "       pxor  8(%2), %%mm1   ;\n"
-                               "       movq 16(%1), %%mm2   ;\n"
-                               "       pxor   (%3), %%mm0   ;\n"
-                               "       pxor  8(%3), %%mm1   ;\n"
-                               "       pxor 16(%2), %%mm2   ;\n"
-                               "       pxor   (%4), %%mm0   ;\n"
-                               "       pxor  8(%4), %%mm1   ;\n"
-                               "       pxor 16(%3), %%mm2   ;\n"
-                               "       movq 24(%1), %%mm3   ;\n"
-                               "       pxor   (%5), %%mm0   ;\n"
-                               "       pxor  8(%5), %%mm1   ;\n"
-                               "       movq %%mm0,   (%1)   ;\n"
-                               "       pxor 16(%4), %%mm2   ;\n"
-                               "       pxor 24(%2), %%mm3   ;\n"
-                               "       movq %%mm1,  8(%1)   ;\n"
-                               "       pxor 16(%5), %%mm2   ;\n"
-                               "       pxor 24(%3), %%mm3   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
-                               "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 24(%4), %%mm3   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
-                               "       pxor 24(%5), %%mm3   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%4), %%mm4   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       pxor 32(%5), %%mm4   ;\n"
-                               "       pxor 40(%4), %%mm5   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       pxor 40(%5), %%mm5   ;\n"
-                               "       pxor 48(%4), %%mm6   ;\n"
-                               "       pxor 56(%4), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%5), %%mm6   ;\n"
-                               "       pxor 56(%5), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
-        
-                               "       addl $64, %1         ;\n"
-                               "       addl $64, %2         ;\n"
-                               "       addl $64, %3         ;\n"
-                               "       addl $64, %4         ;\n"
-                               "       addl $64, %5         ;\n"
-                               "       decl %0              ;\n"
-                               "       jnz 1b               ;\n"
-
-                               : 
-                               : "g" (lines),
-                                 "r" (bh_ptr[0]->b_data),
-                                 "r" (bh_ptr[1]->b_data),
-                                 "r" (bh_ptr[2]->b_data),
-                                 "r" (bh_ptr[3]->b_data),
-                                 "r" (bh_ptr[4]->b_data)
-                               : "memory" );
-                       break;
-       }
-
-       __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
-       if (!(current->flags & PF_USEDFPU))
-               stts();
-}
-#endif /* __i386__ */
-#endif /* !__sparc_v9__ */
-
-#ifdef __sparc_v9__
-/*
- * High speed xor_block operation for RAID4/5 utilizing the
- * UltraSparc Visual Instruction Set.
- *
- * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- *
- *     Requirements:
- *     !(((long)dest | (long)sourceN) & (64 - 1)) &&
- *     !(len & 127) && len >= 256
- *
- * It is done in pure assembly, as otherwise gcc would make it
- * a non-leaf function, which is not what we want.
- * Also, we don't measure its speed as we do on other architectures,
- * because the measuring routine does not account for cold caches,
- * nor for the fact that xor_block_VIS bypasses the caches.
- * xor_block_32regs might be 5% faster for count 2 if the caches are
- * hot and things are just right (for count 3 VIS is about as fast as
- * 32regs with hot caches, and for counts 4 and 5 VIS is always faster
- * by a good margin), but I think it is better not to pollute the caches.
- * Actually, if I were only fighting for hot-cache speed, I could
- * write a hybrid VIS/integer routine that always does two
- * 64B blocks in VIS and two in the IEUs, but I really care more about
- * the caches.
- */
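-
-/*
- * A minimal sketch of the requirements above (hypothetical helper,
- * shown for illustration only): dest/src are the buffer addresses,
- * len is the length in bytes, and every source must pass the same
- * check as the one shown.
- */
-static inline int vis_xor_ok(long dest, long src, int len)
-{
-	return !((dest | src) & (64 - 1)) &&	/* 64-byte aligned */
-	       !(len & 127) && len >= 256;	/* len multiple of 128 */
-}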
-extern void *VISenter(void);
-extern void xor_block_VIS XOR_ARGS;
-
-void __xor_block_VIS(void)
-{
-__asm__ ("
-       .globl xor_block_VIS
-xor_block_VIS:
-       ldx     [%%o1 + 0], %%o4
-       ldx     [%%o1 + 8], %%o3
-       ldx     [%%o4 + %1], %%g5
-       ldx     [%%o4 + %0], %%o4
-       ldx     [%%o3 + %0], %%o3
-       rd      %%fprs, %%o5
-       andcc   %%o5, %2, %%g0
-       be,pt   %%icc, 297f
-        sethi  %%hi(%5), %%g1
-       jmpl    %%g1 + %%lo(%5), %%g7
-        add    %%g7, 8, %%g7
-297:   wr      %%g0, %4, %%fprs
-       membar  #LoadStore|#StoreLoad|#StoreStore
-       sub     %%g5, 64, %%g5
-       ldda    [%%o4] %3, %%f0
-       ldda    [%%o3] %3, %%f16
-       cmp     %%o0, 4
-       bgeu,pt %%xcc, 10f
-        cmp    %%o0, 3
-       be,pn   %%xcc, 13f
-        mov    -64, %%g1
-       sub     %%g5, 64, %%g5
-       rd      %%asi, %%g1
-       wr      %%g0, %3, %%asi
-
-2:     ldda    [%%o4 + 64] %%asi, %%f32
-       fxor    %%f0, %%f16, %%f16
-       fxor    %%f2, %%f18, %%f18
-       fxor    %%f4, %%f20, %%f20
-       fxor    %%f6, %%f22, %%f22
-       fxor    %%f8, %%f24, %%f24
-       fxor    %%f10, %%f26, %%f26
-       fxor    %%f12, %%f28, %%f28
-       fxor    %%f14, %%f30, %%f30
-       stda    %%f16, [%%o4] %3
-       ldda    [%%o3 + 64] %%asi, %%f48
-       ldda    [%%o4 + 128] %%asi, %%f0
-       fxor    %%f32, %%f48, %%f48
-       fxor    %%f34, %%f50, %%f50
-       add     %%o4, 128, %%o4
-       fxor    %%f36, %%f52, %%f52
-       add     %%o3, 128, %%o3
-       fxor    %%f38, %%f54, %%f54
-       subcc   %%g5, 128, %%g5
-       fxor    %%f40, %%f56, %%f56
-       fxor    %%f42, %%f58, %%f58
-       fxor    %%f44, %%f60, %%f60
-       fxor    %%f46, %%f62, %%f62
-       stda    %%f48, [%%o4 - 64] %%asi
-       bne,pt  %%xcc, 2b
-        ldda   [%%o3] %3, %%f16
-
-       ldda    [%%o4 + 64] %%asi, %%f32
-       fxor    %%f0, %%f16, %%f16
-       fxor    %%f2, %%f18, %%f18
-       fxor    %%f4, %%f20, %%f20
-       fxor    %%f6, %%f22, %%f22
-       fxor    %%f8, %%f24, %%f24
-       fxor    %%f10, %%f26, %%f26
-       fxor    %%f12, %%f28, %%f28
-       fxor    %%f14, %%f30, %%f30
-       stda    %%f16, [%%o4] %3
-       ldda    [%%o3 + 64] %%asi, %%f48
-       membar  #Sync
-       fxor    %%f32, %%f48, %%f48
-       fxor    %%f34, %%f50, %%f50
-       fxor    %%f36, %%f52, %%f52
-       fxor    %%f38, %%f54, %%f54
-       fxor    %%f40, %%f56, %%f56
-       fxor    %%f42, %%f58, %%f58
-       fxor    %%f44, %%f60, %%f60
-       fxor    %%f46, %%f62, %%f62
-       stda    %%f48, [%%o4 + 64] %%asi
-       membar  #Sync|#StoreStore|#StoreLoad
-       wr      %%g0, 0, %%fprs
-       retl
-        wr     %%g1, %%g0, %%asi
-
-13:    ldx     [%%o1 + 16], %%o2
-       ldx     [%%o2 + %0], %%o2
-
-3:     ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f48
-       fxor    %%f2, %%f18, %%f50
-       add     %%o4, 64, %%o4
-       fxor    %%f4, %%f20, %%f52
-       fxor    %%f6, %%f22, %%f54
-       add     %%o3, 64, %%o3
-       fxor    %%f8, %%f24, %%f56
-       fxor    %%f10, %%f26, %%f58
-       fxor    %%f12, %%f28, %%f60
-       fxor    %%f14, %%f30, %%f62
-       ldda    [%%o4] %3, %%f0
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       add     %%o2, 64, %%o2
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       subcc   %%g5, 64, %%g5
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       stda    %%f48, [%%o4 + %%g1] %3
-       bne,pt  %%xcc, 3b
-        ldda   [%%o3] %3, %%f16
-
-       ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f48
-       fxor    %%f2, %%f18, %%f50
-       fxor    %%f4, %%f20, %%f52
-       fxor    %%f6, %%f22, %%f54
-       fxor    %%f8, %%f24, %%f56
-       fxor    %%f10, %%f26, %%f58
-       fxor    %%f12, %%f28, %%f60
-       fxor    %%f14, %%f30, %%f62
-       membar  #Sync
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       stda    %%f48, [%%o4] %3
-       membar  #Sync|#StoreStore|#StoreLoad
-       retl
-        wr     %%g0, 0, %%fprs
-
-10:    cmp     %%o0, 5
-       be,pt   %%xcc, 15f
-        mov    -64, %%g1
-
-14:    ldx     [%%o1 + 16], %%o2
-       ldx     [%%o1 + 24], %%o0
-       ldx     [%%o2 + %0], %%o2
-       ldx     [%%o0 + %0], %%o0
-
-4:     ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f16
-       fxor    %%f2, %%f18, %%f18
-       add     %%o4, 64, %%o4
-       fxor    %%f4, %%f20, %%f20
-       fxor    %%f6, %%f22, %%f22
-       add     %%o3, 64, %%o3
-       fxor    %%f8, %%f24, %%f24
-       fxor    %%f10, %%f26, %%f26
-       fxor    %%f12, %%f28, %%f28
-       fxor    %%f14, %%f30, %%f30
-       ldda    [%%o0] %3, %%f48
-       fxor    %%f16, %%f32, %%f32
-       fxor    %%f18, %%f34, %%f34
-       fxor    %%f20, %%f36, %%f36
-       fxor    %%f22, %%f38, %%f38
-       add     %%o2, 64, %%o2
-       fxor    %%f24, %%f40, %%f40
-       fxor    %%f26, %%f42, %%f42
-       fxor    %%f28, %%f44, %%f44
-       fxor    %%f30, %%f46, %%f46
-       ldda    [%%o4] %3, %%f0
-       fxor    %%f32, %%f48, %%f48
-       fxor    %%f34, %%f50, %%f50
-       fxor    %%f36, %%f52, %%f52
-       add     %%o0, 64, %%o0
-       fxor    %%f38, %%f54, %%f54
-       fxor    %%f40, %%f56, %%f56
-       fxor    %%f42, %%f58, %%f58
-       subcc   %%g5, 64, %%g5
-       fxor    %%f44, %%f60, %%f60
-       fxor    %%f46, %%f62, %%f62
-       stda    %%f48, [%%o4 + %%g1] %3
-       bne,pt  %%xcc, 4b
-        ldda   [%%o3] %3, %%f16
-
-       ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f16
-       fxor    %%f2, %%f18, %%f18
-       fxor    %%f4, %%f20, %%f20
-       fxor    %%f6, %%f22, %%f22
-       fxor    %%f8, %%f24, %%f24
-       fxor    %%f10, %%f26, %%f26
-       fxor    %%f12, %%f28, %%f28
-       fxor    %%f14, %%f30, %%f30
-       ldda    [%%o0] %3, %%f48
-       fxor    %%f16, %%f32, %%f32
-       fxor    %%f18, %%f34, %%f34
-       fxor    %%f20, %%f36, %%f36
-       fxor    %%f22, %%f38, %%f38
-       fxor    %%f24, %%f40, %%f40
-       fxor    %%f26, %%f42, %%f42
-       fxor    %%f28, %%f44, %%f44
-       fxor    %%f30, %%f46, %%f46
-       membar  #Sync
-       fxor    %%f32, %%f48, %%f48
-       fxor    %%f34, %%f50, %%f50
-       fxor    %%f36, %%f52, %%f52
-       fxor    %%f38, %%f54, %%f54
-       fxor    %%f40, %%f56, %%f56
-       fxor    %%f42, %%f58, %%f58
-       fxor    %%f44, %%f60, %%f60
-       fxor    %%f46, %%f62, %%f62
-       stda    %%f48, [%%o4] %3
-       membar  #Sync|#StoreStore|#StoreLoad
-       retl
-        wr     %%g0, 0, %%fprs
-
-15:    ldx     [%%o1 + 16], %%o2
-       ldx     [%%o1 + 24], %%o0
-       ldx     [%%o1 + 32], %%o1
-       ldx     [%%o2 + %0], %%o2
-       ldx     [%%o0 + %0], %%o0
-       ldx     [%%o1 + %0], %%o1
-
-5:     ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f48
-       fxor    %%f2, %%f18, %%f50
-       add     %%o4, 64, %%o4
-       fxor    %%f4, %%f20, %%f52
-       fxor    %%f6, %%f22, %%f54
-       add     %%o3, 64, %%o3
-       fxor    %%f8, %%f24, %%f56
-       fxor    %%f10, %%f26, %%f58
-       fxor    %%f12, %%f28, %%f60
-       fxor    %%f14, %%f30, %%f62
-       ldda    [%%o0] %3, %%f16
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       add     %%o2, 64, %%o2
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       ldda    [%%o1] %3, %%f32
-       fxor    %%f48, %%f16, %%f48
-       fxor    %%f50, %%f18, %%f50
-       add     %%o0, 64, %%o0
-       fxor    %%f52, %%f20, %%f52
-       fxor    %%f54, %%f22, %%f54
-       add     %%o1, 64, %%o1
-       fxor    %%f56, %%f24, %%f56
-       fxor    %%f58, %%f26, %%f58
-       fxor    %%f60, %%f28, %%f60
-       fxor    %%f62, %%f30, %%f62
-       ldda    [%%o4] %3, %%f0
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       subcc   %%g5, 64, %%g5
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       stda    %%f48, [%%o4 + %%g1] %3
-       bne,pt  %%xcc, 5b
-        ldda   [%%o3] %3, %%f16
-
-       ldda    [%%o2] %3, %%f32
-       fxor    %%f0, %%f16, %%f48
-       fxor    %%f2, %%f18, %%f50
-       fxor    %%f4, %%f20, %%f52
-       fxor    %%f6, %%f22, %%f54
-       fxor    %%f8, %%f24, %%f56
-       fxor    %%f10, %%f26, %%f58
-       fxor    %%f12, %%f28, %%f60
-       fxor    %%f14, %%f30, %%f62
-       ldda    [%%o0] %3, %%f16
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       ldda    [%%o1] %3, %%f32
-       fxor    %%f48, %%f16, %%f48
-       fxor    %%f50, %%f18, %%f50
-       fxor    %%f52, %%f20, %%f52
-       fxor    %%f54, %%f22, %%f54
-       fxor    %%f56, %%f24, %%f56
-       fxor    %%f58, %%f26, %%f58
-       fxor    %%f60, %%f28, %%f60
-       fxor    %%f62, %%f30, %%f62
-       membar  #Sync
-       fxor    %%f48, %%f32, %%f48
-       fxor    %%f50, %%f34, %%f50
-       fxor    %%f52, %%f36, %%f52
-       fxor    %%f54, %%f38, %%f54
-       fxor    %%f56, %%f40, %%f56
-       fxor    %%f58, %%f42, %%f58
-       fxor    %%f60, %%f44, %%f60
-       fxor    %%f62, %%f46, %%f62
-       stda    %%f48, [%%o4] %3
-       membar  #Sync|#StoreStore|#StoreLoad
-       retl
-        wr     %%g0, 0, %%fprs
-       " : :
-       "i" (&((struct buffer_head *)0)->b_data),
-       "i" (&((struct buffer_head *)0)->b_size),
-       "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P),
-       "i" (FPRS_FEF), "i" (VISenter));
-}
-#endif /* __sparc_v9__ */
-
-#if defined(__sparc__) && !defined(__sparc_v9__)
-/*
- * High speed xor_block operation for RAID4/5 utilizing the
- * ldd/std SPARC instructions.
- *
- * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
- */
-
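-/*
- * A rough C equivalent of one pass of the count == 2 loop below
- * (hypothetical helper, for illustration only): each ldd/std in the
- * asm moves an aligned 8-byte register pair, so four loads and four
- * stores cover the 32 bytes per iteration that the pointer
- * increments account for.
- */
-static inline void xor_32_bytes(long *destp, long *source1)
-{
-	int i;
-
-	for (i = 0; i < 8; i++)
-		destp[i] ^= source1[i];
-}
-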
-XORBLOCK_TEMPLATE(SPARC)
-{
-       int size  = bh_ptr[0]->b_size;
-       int lines = size / (sizeof (long)) / 8, i;
-       long *destp   = (long *) bh_ptr[0]->b_data;
-       long *source1 = (long *) bh_ptr[1]->b_data;
-       long *source2, *source3, *source4;
-
-       switch (count) {
-       case 2:
-               for (i = lines; i > 0; i--) {
-                 __asm__ __volatile__("
-                 ldd [%0 + 0x00], %%g2
-                 ldd [%0 + 0x08], %%g4
-                 ldd [%0 + 0x10], %%o0
-                 ldd [%0 + 0x18], %%o2
-                 ldd [%1 + 0x00], %%o4
-                 ldd [%1 + 0x08], %%l0
-                 ldd [%1 + 0x10], %%l2
-                 ldd [%1 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 std %%g2, [%0 + 0x00]
-                 std %%g4, [%0 + 0x08]
-                 std %%o0, [%0 + 0x10]
-                 std %%o2, [%0 + 0x18]
-                 " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", 
-                 "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5");
-                 destp += 8;
-                 source1 += 8;
-               }
-               break;
-       case 3:
-               source2 = (long *) bh_ptr[2]->b_data;
-               for (i = lines; i > 0; i--) {
-                 __asm__ __volatile__("
-                 ldd [%0 + 0x00], %%g2
-                 ldd [%0 + 0x08], %%g4
-                 ldd [%0 + 0x10], %%o0
-                 ldd [%0 + 0x18], %%o2
-                 ldd [%1 + 0x00], %%o4
-                 ldd [%1 + 0x08], %%l0
-                 ldd [%1 + 0x10], %%l2
-                 ldd [%1 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%2 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%2 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%2 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%2 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 std %%g2, [%0 + 0x00]
-                 std %%g4, [%0 + 0x08]
-                 std %%o0, [%0 + 0x10]
-                 std %%o2, [%0 + 0x18]
-                 " : : "r" (destp), "r" (source1), "r" (source2)
-                 : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
-                 "l0", "l1", "l2", "l3", "l4", "l5");
-                 destp += 8;
-                 source1 += 8;
-                 source2 += 8;
-               }
-               break;
-       case 4:
-               source2 = (long *) bh_ptr[2]->b_data;
-               source3 = (long *) bh_ptr[3]->b_data;
-               for (i = lines; i > 0; i--) {
-                 __asm__ __volatile__("
-                 ldd [%0 + 0x00], %%g2
-                 ldd [%0 + 0x08], %%g4
-                 ldd [%0 + 0x10], %%o0
-                 ldd [%0 + 0x18], %%o2
-                 ldd [%1 + 0x00], %%o4
-                 ldd [%1 + 0x08], %%l0
-                 ldd [%1 + 0x10], %%l2
-                 ldd [%1 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%2 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%2 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%2 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%2 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%3 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%3 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%3 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%3 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 std %%g2, [%0 + 0x00]
-                 std %%g4, [%0 + 0x08]
-                 std %%o0, [%0 + 0x10]
-                 std %%o2, [%0 + 0x18]
-                 " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3)
-                 : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
-                 "l0", "l1", "l2", "l3", "l4", "l5");
-                 destp += 8;
-                 source1 += 8;
-                 source2 += 8;
-                 source3 += 8;
-               }
-               break;
-       case 5:
-               source2 = (long *) bh_ptr[2]->b_data;
-               source3 = (long *) bh_ptr[3]->b_data;
-               source4 = (long *) bh_ptr[4]->b_data;
-               for (i = lines; i > 0; i--) {
-                 __asm__ __volatile__("
-                 ldd [%0 + 0x00], %%g2
-                 ldd [%0 + 0x08], %%g4
-                 ldd [%0 + 0x10], %%o0
-                 ldd [%0 + 0x18], %%o2
-                 ldd [%1 + 0x00], %%o4
-                 ldd [%1 + 0x08], %%l0
-                 ldd [%1 + 0x10], %%l2
-                 ldd [%1 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%2 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%2 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%2 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%2 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%3 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%3 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%3 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%3 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 ldd [%4 + 0x00], %%o4
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 ldd [%4 + 0x08], %%l0
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 ldd [%4 + 0x10], %%l2
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 ldd [%4 + 0x18], %%l4
-                 xor %%g2, %%o4, %%g2
-                 xor %%g3, %%o5, %%g3
-                 xor %%g4, %%l0, %%g4
-                 xor %%g5, %%l1, %%g5
-                 xor %%o0, %%l2, %%o0
-                 xor %%o1, %%l3, %%o1
-                 xor %%o2, %%l4, %%o2
-                 xor %%o3, %%l5, %%o3
-                 std %%g2, [%0 + 0x00]
-                 std %%g4, [%0 + 0x08]
-                 std %%o0, [%0 + 0x10]
-                 std %%o2, [%0 + 0x18]
-                 " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4)
-                 : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5",
-                 "l0", "l1", "l2", "l3", "l4", "l5");
-                 destp += 8;
-                 source1 += 8;
-                 source2 += 8;
-                 source3 += 8;
-                 source4 += 8;
-               }
-               break;
-       }
-}
-#endif /* __sparc__ && !__sparc_v9__ */
-
-#ifdef __alpha__
-/*
- * High speed xor_block operation for RAID4/5 pipelined for Alpha EV5.
- * There is a second version using EV6 prefetch instructions.
- *
- * Copyright (C) 2000 Richard Henderson (rth@redhat.com)
- */
-
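-/*
- * For reference, each unrolled loop below is the plain-C equivalent
- * of this hypothetical helper, working 64 bytes (eight quadwords) at
- * a time with the loads and xors interleaved to keep the EV5 pipes
- * busy; the two-, three- and four-source cases simply drop the
- * trailing terms.
- */
-static inline void xor_8_quads(long *d, long *s1, long *s2,
-			       long *s3, long *s4)
-{
-	int i;
-
-	for (i = 0; i < 8; i++)
-		d[i] ^= s1[i] ^ s2[i] ^ s3[i] ^ s4[i];
-}
-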
-XORBLOCK_TEMPLATE(alpha)
-{
-       long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
-       long *d = (long *) bh_ptr[0]->b_data;
-       long *s1 = (long *) bh_ptr[1]->b_data;
-       long *s2, *s3, *s4;
-
-       if (count == 2) goto two_blocks;
-
-       s2 = (long *) bh_ptr[2]->b_data;
-       if (count == 3) goto three_blocks;
-
-       s3 = (long *) bh_ptr[3]->b_data;
-       if (count == 4) goto four_blocks;
-
-       s4 = (long *) bh_ptr[4]->b_data;
-       goto five_blocks;
-
-two_blocks:
-asm volatile ("
-       .align 4
-2:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,8(%0)
-       ldq $3,8(%1)
-
-       ldq $4,16(%0)
-       ldq $5,16(%1)
-       ldq $6,24(%0)
-       ldq $7,24(%1)
-
-       ldq $16,32(%0)
-       ldq $17,32(%1)
-       ldq $18,40(%0)
-       ldq $19,40(%1)
-
-       ldq $20,48(%0)
-       ldq $21,48(%1)
-       ldq $22,56(%0)
-       xor $0,$1,$0            # 7 cycles from $1 load
-
-       ldq $23,56(%1)
-       xor $2,$3,$2
-       stq $0,0(%0)
-       xor $4,$5,$4
-
-       stq $2,8(%0)
-       xor $6,$7,$6
-       stq $4,16(%0)
-       xor $16,$17,$16
-
-       stq $6,24(%0)
-       xor $18,$19,$18
-       stq $16,32(%0)
-       xor $20,$21,$20
-
-       stq $18,40(%0)
-       xor $22,$23,$22
-       stq $20,48(%0)
-       subq %2,1,%2
-
-       stq $22,56(%0)
-       addq %0,64,%0
-       addq %1,64,%1
-       bgt %2,2b"
-       : "=r"(d), "=r"(s1), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
-       return;
-
-three_blocks:
-asm volatile ("
-       .align 4
-3:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,8(%0)
-
-       ldq $4,8(%1)
-       ldq $6,16(%0)
-       ldq $7,16(%1)
-       ldq $17,24(%0)
-
-       ldq $18,24(%1)
-       ldq $20,32(%0)
-       ldq $21,32(%1)
-       ldq $5,8(%2)
-
-       ldq $16,16(%2)
-       ldq $19,24(%2)
-       ldq $22,32(%2)
-       nop
-
-       xor $0,$1,$1            # 8 cycles from $0 load
-       xor $3,$4,$4            # 6 cycles from $4 load
-       xor $6,$7,$7            # 6 cycles from $7 load
-       xor $17,$18,$18         # 5 cycles from $18 load
-
-       xor $1,$2,$2            # 9 cycles from $2 load
-       xor $20,$21,$21         # 5 cycles from $21 load
-       stq $2,0(%0)
-       xor $4,$5,$5            # 6 cycles from $5 load
-
-       stq $5,8(%0)
-       xor $7,$16,$16          # 7 cycles from $16 load
-       stq $16,16(%0)
-       xor $18,$19,$19         # 7 cycles from $19 load
-
-       stq $19,24(%0)
-       xor $21,$22,$22         # 7 cycles from $22 load
-       stq $22,32(%0)
-       nop
-
-       ldq $0,40(%0)
-       ldq $1,40(%1)
-       ldq $3,48(%0)
-       ldq $4,48(%1)
-
-       ldq $6,56(%0)
-       ldq $7,56(%1)
-       ldq $2,40(%2)
-       ldq $5,48(%2)
-
-       ldq $16,56(%2)
-       xor $0,$1,$1            # 4 cycles from $1 load
-       xor $3,$4,$4            # 5 cycles from $4 load
-       xor $6,$7,$7            # 5 cycles from $7 load
-
-       xor $1,$2,$2            # 4 cycles from $2 load
-       xor $4,$5,$5            # 5 cycles from $5 load
-       stq $2,40(%0)
-       xor $7,$16,$16          # 4 cycles from $16 load
-
-       stq $5,48(%0)
-       subq %3,1,%3
-       stq $16,56(%0)
-       addq %2,64,%2
-
-       addq %1,64,%1
-       addq %0,64,%0
-       bgt %3,3b"
-       : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(s2), "3"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21", "$22");
-       return;
-
-four_blocks:
-asm volatile ("
-       .align 4
-4:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,0(%3)
-
-       ldq $4,8(%0)
-       ldq $5,8(%1)
-       ldq $6,8(%2)
-       ldq $7,8(%3)
-
-       ldq $16,16(%0)
-       ldq $17,16(%1)
-       ldq $18,16(%2)
-       ldq $19,16(%3)
-
-       ldq $20,24(%0)
-       xor $0,$1,$1            # 6 cycles from $1 load
-       ldq $21,24(%1)
-       xor $2,$3,$3            # 6 cycles from $3 load
-
-       ldq $0,24(%2)
-       xor $1,$3,$3
-       ldq $1,24(%3)
-       xor $4,$5,$5            # 7 cycles from $5 load
-
-       stq $3,0(%0)
-       xor $6,$7,$7
-       xor $16,$17,$17         # 7 cycles from $17 load
-       xor $5,$7,$7
-
-       stq $7,8(%0)
-       xor $18,$19,$19         # 7 cycles from $19 load
-       ldq $2,32(%0)
-       xor $17,$19,$19
-
-       ldq $3,32(%1)
-       ldq $4,32(%2)
-       ldq $5,32(%3)
-       xor $20,$21,$21         # 8 cycles from $21 load
-
-       ldq $6,40(%0)
-       ldq $7,40(%1)
-       ldq $16,40(%2)
-       ldq $17,40(%3)
-
-       stq $19,16(%0)
-       xor $0,$1,$1            # 9 cycles from $1 load
-       xor $2,$3,$3            # 5 cycles from $3 load
-       xor $21,$1,$1
-
-       ldq $18,48(%0)
-       xor $4,$5,$5            # 5 cycles from $5 load
-       ldq $19,48(%1)
-       xor $3,$5,$5
-
-       ldq $20,48(%2)
-       ldq $21,48(%3)
-       ldq $0,56(%0)
-       ldq $1,56(%1)
-
-       ldq $2,56(%2)
-       xor $6,$7,$7            # 8 cycles from $6 load
-       ldq $3,56(%3)
-       xor $16,$17,$17         # 8 cycles from $17 load
-
-       xor $7,$17,$17
-       xor $18,$19,$19         # 5 cycles from $19 load
-       xor $20,$21,$21         # 5 cycles from $21 load
-       xor $19,$21,$21
-
-       stq $1,24(%0)
-       xor $0,$1,$1            # 5 cycles from $1 load
-       stq $5,32(%0)
-       xor $2,$3,$3            # 4 cycles from $3 load
-
-       stq $17,40(%0)
-       xor $1,$3,$3
-       stq $21,48(%0)
-       subq %4,1,%4
-
-       stq $3,56(%0)
-       addq %3,64,%3
-       addq %2,64,%2
-       addq %1,64,%1
-
-       addq %0,64,%0
-       bgt %4,4b"
-       : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21");
-       return;
-
-five_blocks:
-asm volatile ("
-       ldq %0,0(%6)
-       ldq %1,8(%6)
-       ldq %2,16(%6)
-       ldq %3,24(%6)
-       ldq %4,32(%6)
-       ldq %0,%7(%0)
-       ldq %1,%7(%1)
-       ldq %2,%7(%2)
-       ldq %3,%7(%3)
-       ldq %4,%7(%4)
-       .align 4
-5:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,0(%3)
-
-       ldq $4,0(%4)
-       ldq $5,8(%0)
-       ldq $6,8(%1)
-       ldq $7,8(%2)
-
-       ldq $16,8(%3)
-       ldq $17,8(%4)
-       ldq $18,16(%0)
-       ldq $19,16(%1)
-
-       ldq $20,16(%2)
-       xor $0,$1,$1            # 6 cycles from $1 load
-       ldq $21,16(%3)
-       xor $2,$3,$3            # 6 cycles from $3 load
-
-       ldq $0,16(%4)
-       xor $1,$3,$3
-       ldq $1,24(%0)
-       xor $3,$4,$4            # 7 cycles from $4 load
-
-       stq $4,0(%0)
-       xor $5,$6,$6            # 7 cycles from $6 load
-       xor $7,$16,$16          # 7 cycles from $16 load
-       xor $6,$17,$17          # 7 cycles from $17 load
-
-       ldq $2,24(%1)
-       xor $16,$17,$17
-       ldq $3,24(%2)
-       xor $18,$19,$19         # 8 cycles from $19 load
-
-       stq $17,8(%0)
-       xor $19,$20,$20         # 8 cycles from $20 load
-       ldq $4,24(%3)
-       xor $21,$0,$0           # 7 cycles from $0 load
-
-       ldq $5,24(%4)
-       xor $20,$0,$0
-       ldq $6,32(%0)
-       ldq $7,32(%1)
-
-       stq $0,16(%0)
-       xor $1,$2,$2            # 6 cycles from $2 load
-       ldq $16,32(%2)
-       xor $3,$4,$4            # 4 cycles from $4 load
-       
-       ldq $17,32(%3)
-       xor $2,$4,$4
-       ldq $18,32(%4)
-       ldq $19,40(%0)
-
-       ldq $20,40(%1)
-       ldq $21,40(%2)
-       ldq $0,40(%3)
-       xor $4,$5,$5            # 7 cycles from $5 load
-
-       stq $5,24(%0)
-       xor $6,$7,$7            # 7 cycles from $7 load
-       ldq $1,40(%4)
-       ldq $2,48(%0)
-
-       ldq $3,48(%1)
-       xor $7,$16,$16          # 7 cycles from $16 load
-       ldq $4,48(%2)
-       xor $17,$18,$18         # 6 cycles from $18 load
-
-       ldq $5,48(%3)
-       xor $16,$18,$18
-       ldq $6,48(%4)
-       xor $19,$20,$20         # 7 cycles from $20 load
-
-       stq $18,32(%0)
-       xor $20,$21,$21         # 8 cycles from $21 load
-       ldq $7,56(%0)
-       xor $0,$1,$1            # 6 cycles from $1 load
-
-       ldq $16,56(%1)
-       ldq $17,56(%2)
-       ldq $18,56(%3)
-       ldq $19,56(%4)
-
-       xor $21,$1,$1
-       xor $2,$3,$3            # 9 cycles from $3 load
-       xor $3,$4,$4            # 9 cycles from $4 load
-       xor $5,$6,$6            # 8 cycles from $6 load
-
-       unop
-       xor $4,$6,$6
-       xor $7,$16,$16          # 7 cycles from $16 load
-       xor $17,$18,$18         # 6 cycles from $18 load
-
-       stq $6,48(%0)
-       xor $16,$18,$18
-       subq %5,1,%5
-       xor $18,$19,$19         # 8 cycles from $19 load
-
-       stq $19,56(%0)
-       addq %4,64,%4
-       addq %3,64,%3
-       addq %2,64,%2
-
-       addq %1,64,%1
-       addq %0,64,%0
-       bgt %5,5b"
-       : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
-       /* ARG! We've run out of asm arguments!  We've got to reload
-          all those pointers we just loaded.  */
-       : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21");
-       return;
-}
-
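-/*
- * A load whose destination is $31 (the Alpha zero register) has no
- * architectural effect and acts as a software prefetch hint for the
- * cache line at base+ofs.  Making "base" an in/out operand presumably
- * keeps gcc from moving the hint away from later uses of the pointer.
- */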
-#define prefetch(base, ofs) \
-       asm("ldq $31,%2(%0)" : "=r"(base) : "0"(base), "i"(ofs))
-
-XORBLOCK_TEMPLATE(alpha_prefetch)
-{
-       long lines = bh_ptr[0]->b_size / sizeof (long) / 8;
-       long *d = (long *) bh_ptr[0]->b_data;
-       long *s1 = (long *) bh_ptr[1]->b_data;
-       long *s2, *s3, *s4;
-       long p;
-
-       p = count == 2;
-       prefetch(d, 0);
-       prefetch(s1, 0);
-       prefetch(d, 64);
-       prefetch(s1, 64);
-       prefetch(d, 128);
-       prefetch(s1, 128);
-       prefetch(d, 192);
-       prefetch(s1, 192);
-       if (p) goto two_blocks;
-
-       s2 = (long *) bh_ptr[2]->b_data;
-       p = count == 3;
-       prefetch(s2, 0);
-       prefetch(s2, 64);
-       prefetch(s2, 128);
-       prefetch(s2, 192);
-       if (p) goto three_blocks;
-
-       s3 = (long *) bh_ptr[3]->b_data;
-       p = count == 4;
-       prefetch(s3, 0);
-       prefetch(s3, 64);
-       prefetch(s3, 128);
-       prefetch(s3, 192);
-       if (p) goto four_blocks;
-
-       s4 = (long *) bh_ptr[4]->b_data;
-       prefetch(s4, 0);
-       prefetch(s4, 64);
-       prefetch(s4, 128);
-       prefetch(s4, 192);
-       goto five_blocks;
-
-two_blocks:
-asm volatile ("
-       .align 4
-2:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,8(%0)
-       ldq $3,8(%1)
-
-       ldq $4,16(%0)
-       ldq $5,16(%1)
-       ldq $6,24(%0)
-       ldq $7,24(%1)
-
-       ldq $16,32(%0)
-       ldq $17,32(%1)
-       ldq $18,40(%0)
-       ldq $19,40(%1)
-
-       ldq $20,48(%0)
-       ldq $21,48(%1)
-       ldq $22,56(%0)
-       ldq $23,56(%1)
-
-       ldq $31,256(%0)
-       xor $0,$1,$0            # 8 cycles from $1 load
-       ldq $31,256(%1)
-       xor $2,$3,$2
-
-       stq $0,0(%0)
-       xor $4,$5,$4
-       stq $2,8(%0)
-       xor $6,$7,$6
-
-       stq $4,16(%0)
-       xor $16,$17,$16
-       stq $6,24(%0)
-       xor $18,$19,$18
-
-       stq $16,32(%0)
-       xor $20,$21,$20
-       stq $18,40(%0)
-       xor $22,$23,$22
-
-       stq $20,48(%0)
-       subq %2,1,%2
-       stq $22,56(%0)
-       addq %0,64,%0
-
-       addq %1,64,%1
-       bgt %2,2b"
-       : "=r"(d), "=r"(s1), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23");
-       return;
-
-three_blocks:
-asm volatile ("
-       .align 4
-3:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,8(%0)
-
-       ldq $4,8(%1)
-       ldq $6,16(%0)
-       ldq $7,16(%1)
-       ldq $17,24(%0)
-
-       ldq $18,24(%1)
-       ldq $20,32(%0)
-       ldq $21,32(%1)
-       ldq $5,8(%2)
-
-       ldq $16,16(%2)
-       ldq $19,24(%2)
-       ldq $22,32(%2)
-       nop
-
-       xor $0,$1,$1            # 8 cycles from $0 load
-       xor $3,$4,$4            # 7 cycles from $4 load
-       xor $6,$7,$7            # 6 cycles from $7 load
-       xor $17,$18,$18         # 5 cycles from $18 load
-
-       xor $1,$2,$2            # 9 cycles from $2 load
-       xor $20,$21,$21         # 5 cycles from $21 load
-       stq $2,0(%0)
-       xor $4,$5,$5            # 6 cycles from $5 load
-
-       stq $5,8(%0)
-       xor $7,$16,$16          # 7 cycles from $16 load
-       stq $16,16(%0)
-       xor $18,$19,$19         # 7 cycles from $19 load
-
-       stq $19,24(%0)
-       xor $21,$22,$22         # 7 cycles from $22 load
-       stq $22,32(%0)
-       nop
-
-       ldq $0,40(%0)
-       ldq $1,40(%1)
-       ldq $3,48(%0)
-       ldq $4,48(%1)
-
-       ldq $6,56(%0)
-       ldq $7,56(%1)
-       ldq $2,40(%2)
-       ldq $5,48(%2)
-
-       ldq $16,56(%2)
-       ldq $31,256(%0)
-       ldq $31,256(%1)
-       ldq $31,256(%2)
-
-       xor $0,$1,$1            # 6 cycles from $1 load
-       xor $3,$4,$4            # 5 cycles from $4 load
-       xor $6,$7,$7            # 5 cycles from $7 load
-       xor $1,$2,$2            # 4 cycles from $2 load
-       
-       xor $4,$5,$5            # 5 cycles from $5 load
-       xor $7,$16,$16          # 4 cycles from $16 load
-       stq $2,40(%0)
-       subq %3,1,%3
-
-       stq $5,48(%0)
-       addq %2,64,%2
-       stq $16,56(%0)
-       addq %1,64,%1
-
-       addq %0,64,%0
-       bgt %3,3b"
-       : "=r"(d), "=r"(s1), "=r"(s2), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(s2), "3"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21", "$22");
-       return;
-
-four_blocks:
-asm volatile ("
-       .align 4
-4:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,0(%3)
-
-       ldq $4,8(%0)
-       ldq $5,8(%1)
-       ldq $6,8(%2)
-       ldq $7,8(%3)
-
-       ldq $16,16(%0)
-       ldq $17,16(%1)
-       ldq $18,16(%2)
-       ldq $19,16(%3)
-
-       ldq $20,24(%0)
-       xor $0,$1,$1            # 6 cycles from $1 load
-       ldq $21,24(%1)
-       xor $2,$3,$3            # 6 cycles from $3 load
-
-       ldq $0,24(%2)
-       xor $1,$3,$3
-       ldq $1,24(%3)
-       xor $4,$5,$5            # 7 cycles from $5 load
-
-       stq $3,0(%0)
-       xor $6,$7,$7
-       xor $16,$17,$17         # 7 cycles from $17 load
-       xor $5,$7,$7
-
-       stq $7,8(%0)
-       xor $18,$19,$19         # 7 cycles from $19 load
-       ldq $2,32(%0)
-       xor $17,$19,$19
-
-       ldq $3,32(%1)
-       ldq $4,32(%2)
-       ldq $5,32(%3)
-       xor $20,$21,$21         # 8 cycles from $21 load
-
-       ldq $6,40(%0)
-       ldq $7,40(%1)
-       ldq $16,40(%2)
-       ldq $17,40(%3)
-
-       stq $19,16(%0)
-       xor $0,$1,$1            # 9 cycles from $1 load
-       xor $2,$3,$3            # 5 cycles from $3 load
-       xor $21,$1,$1
-
-       ldq $18,48(%0)
-       xor $4,$5,$5            # 5 cycles from $5 load
-       ldq $19,48(%1)
-       xor $3,$5,$5
-
-       ldq $20,48(%2)
-       ldq $21,48(%3)
-       ldq $0,56(%0)
-       ldq $1,56(%1)
-
-       ldq $2,56(%2)
-       xor $6,$7,$7            # 8 cycles from $6 load
-       ldq $3,56(%3)
-       xor $16,$17,$17         # 8 cycles from $17 load
-
-       ldq $31,256(%0)
-       xor $7,$17,$17
-       ldq $31,256(%1)
-       xor $18,$19,$19         # 6 cycles from $19 load
-
-       ldq $31,256(%2)
-       xor $20,$21,$21         # 6 cycles from $21 load
-       ldq $31,256(%3)
-       xor $19,$21,$21
-
-       stq $1,24(%0)
-       xor $0,$1,$1            # 7 cycles from $1 load
-       stq $5,32(%0)
-       xor $2,$3,$3            # 6 cycles from $3 load
-
-       stq $17,40(%0)
-       xor $1,$3,$3
-       stq $21,48(%0)
-       subq %4,1,%4
-
-       stq $3,56(%0)
-       addq %3,64,%3
-       addq %2,64,%2
-       addq %1,64,%1
-
-       addq %0,64,%0
-       bgt %4,4b"
-       : "=r"(d), "=r"(s1), "=r"(s2), "=r"(s3), "=r"(lines)
-       : "0"(d), "1"(s1), "2"(s2), "3"(s3), "4"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21");
-       return;
-
-five_blocks:
-asm volatile ("
-       ldq %0,0(%6)
-       ldq %1,8(%6)
-       ldq %2,16(%6)
-       ldq %3,24(%6)
-       ldq %4,32(%6)
-       ldq %0,%7(%0)
-       ldq %1,%7(%1)
-       ldq %2,%7(%2)
-       ldq %3,%7(%3)
-       ldq %4,%7(%4)
-       .align 4
-5:
-       ldq $0,0(%0)
-       ldq $1,0(%1)
-       ldq $2,0(%2)
-       ldq $3,0(%3)
-
-       ldq $4,0(%4)
-       ldq $5,8(%0)
-       ldq $6,8(%1)
-       ldq $7,8(%2)
-
-       ldq $16,8(%3)
-       ldq $17,8(%4)
-       ldq $18,16(%0)
-       ldq $19,16(%1)
-
-       ldq $20,16(%2)
-       xor $0,$1,$1            # 6 cycles from $1 load
-       ldq $21,16(%3)
-       xor $2,$3,$3            # 6 cycles from $3 load
-
-       ldq $0,16(%4)
-       xor $1,$3,$3
-       ldq $1,24(%0)
-       xor $3,$4,$4            # 7 cycles from $4 load
-
-       stq $4,0(%0)
-       xor $5,$6,$6            # 7 cycles from $6 load
-       xor $7,$16,$16          # 7 cycles from $16 load
-       xor $6,$17,$17          # 7 cycles from $17 load
-
-       ldq $2,24(%1)
-       xor $16,$17,$17
-       ldq $3,24(%2)
-       xor $18,$19,$19         # 8 cycles from $19 load
-
-       stq $17,8(%0)
-       xor $19,$20,$20         # 8 cycles from $20 load
-       ldq $4,24(%3)
-       xor $21,$0,$0           # 7 cycles from $0 load
-
-       ldq $5,24(%4)
-       xor $20,$0,$0
-       ldq $6,32(%0)
-       ldq $7,32(%1)
-
-       stq $0,16(%0)
-       xor $1,$2,$2            # 6 cycles from $2 load
-       ldq $16,32(%2)
-       xor $3,$4,$4            # 4 cycles from $4 load
-       
-       ldq $17,32(%3)
-       xor $2,$4,$4
-       ldq $18,32(%4)
-       ldq $19,40(%0)
-
-       ldq $20,40(%1)
-       ldq $21,40(%2)
-       ldq $0,40(%3)
-       xor $4,$5,$5            # 7 cycles from $5 load
-
-       stq $5,24(%0)
-       xor $6,$7,$7            # 7 cycles from $7 load
-       ldq $1,40(%4)
-       ldq $2,48(%0)
-
-       ldq $3,48(%1)
-       xor $7,$16,$16          # 7 cycles from $16 load
-       ldq $4,48(%2)
-       xor $17,$18,$18         # 6 cycles from $18 load
-
-       ldq $5,48(%3)
-       xor $16,$18,$18
-       ldq $6,48(%4)
-       xor $19,$20,$20         # 7 cycles from $20 load
-
-       stq $18,32(%0)
-       xor $20,$21,$21         # 8 cycles from $21 load
-       ldq $7,56(%0)
-       xor $0,$1,$1            # 6 cycles from $1 load
-
-       ldq $16,56(%1)
-       ldq $17,56(%2)
-       ldq $18,56(%3)
-       ldq $19,56(%4)
-
-       ldq $31,256(%0)
-       xor $21,$1,$1
-       ldq $31,256(%1)
-       xor $2,$3,$3            # 9 cycles from $3 load
-
-       ldq $31,256(%2)
-       xor $3,$4,$4            # 9 cycles from $4 load
-       ldq $31,256(%3)
-       xor $5,$6,$6            # 8 cycles from $6 load
-
-       ldq $31,256(%4)
-       xor $4,$6,$6
-       xor $7,$16,$16          # 7 cycles from $16 load
-       xor $17,$18,$18         # 6 cycles from $18 load
-
-       stq $6,48(%0)
-       xor $16,$18,$18
-       subq %5,1,%5
-       xor $18,$19,$19         # 8 cycles from $19 load
-
-       stq $19,56(%0)
-       addq %4,64,%4
-       addq %3,64,%3
-       addq %2,64,%2
-
-       addq %1,64,%1
-       addq %0,64,%0
-       bgt %5,5b"
-       : "=&r"(d), "=&r"(s1), "=&r"(s2), "=&r"(s3), "=r"(s4), "=r"(lines)
-       /* ARG! We've run out of asm arguments!  We've got to reload
-          all those pointers we just loaded.  */
-       : "r"(bh_ptr), "i" (&((struct buffer_head *)0)->b_data), "5"(lines)
-       : "memory", "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
-         "$16", "$17", "$18", "$19", "$20", "$21");
-       return;
-}
-
-#undef prefetch
-
-#endif /* __alpha__ */
-
-#ifndef __sparc_v9__
-
-/*
- * this one works reasonably on any x86 CPU
- * (send me an assembly version for inclusion if you can make it faster)
- *
- * this one is just as fast as if written in pure assembly on x86.
- * the reason for this separate version is that the
- * fast open-coded xor routine "32regs" produces suboptimal code
- * on x86, due to lack of registers.
- */
-XORBLOCK_TEMPLATE(8regs)
-{
-       int len  = bh_ptr[0]->b_size;
-       long *destp   = (long *) bh_ptr[0]->b_data;
-       long *source1, *source2, *source3, *source4;
-       long lines = len / (sizeof (long)) / 8, i;
-
-       switch(count) {
-               case 2:
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               *(destp + 0) ^= *(source1 + 0);
-                               *(destp + 1) ^= *(source1 + 1);
-                               *(destp + 2) ^= *(source1 + 2);
-                               *(destp + 3) ^= *(source1 + 3);
-                               *(destp + 4) ^= *(source1 + 4);
-                               *(destp + 5) ^= *(source1 + 5);
-                               *(destp + 6) ^= *(source1 + 6);
-                               *(destp + 7) ^= *(source1 + 7);
-                               source1 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 3:
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               *(destp + 0) ^= *(source1 + 0);
-                               *(destp + 0) ^= *(source2 + 0);
-                               *(destp + 1) ^= *(source1 + 1);
-                               *(destp + 1) ^= *(source2 + 1);
-                               *(destp + 2) ^= *(source1 + 2);
-                               *(destp + 2) ^= *(source2 + 2);
-                               *(destp + 3) ^= *(source1 + 3);
-                               *(destp + 3) ^= *(source2 + 3);
-                               *(destp + 4) ^= *(source1 + 4);
-                               *(destp + 4) ^= *(source2 + 4);
-                               *(destp + 5) ^= *(source1 + 5);
-                               *(destp + 5) ^= *(source2 + 5);
-                               *(destp + 6) ^= *(source1 + 6);
-                               *(destp + 6) ^= *(source2 + 6);
-                               *(destp + 7) ^= *(source1 + 7);
-                               *(destp + 7) ^= *(source2 + 7);
-                               source1 += 8;
-                               source2 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 4:
-                       source3 = (long *) bh_ptr[3]->b_data;
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               *(destp + 0) ^= *(source1 + 0);
-                               *(destp + 0) ^= *(source2 + 0);
-                               *(destp + 0) ^= *(source3 + 0);
-                               *(destp + 1) ^= *(source1 + 1);
-                               *(destp + 1) ^= *(source2 + 1);
-                               *(destp + 1) ^= *(source3 + 1);
-                               *(destp + 2) ^= *(source1 + 2);
-                               *(destp + 2) ^= *(source2 + 2);
-                               *(destp + 2) ^= *(source3 + 2);
-                               *(destp + 3) ^= *(source1 + 3);
-                               *(destp + 3) ^= *(source2 + 3);
-                               *(destp + 3) ^= *(source3 + 3);
-                               *(destp + 4) ^= *(source1 + 4);
-                               *(destp + 4) ^= *(source2 + 4);
-                               *(destp + 4) ^= *(source3 + 4);
-                               *(destp + 5) ^= *(source1 + 5);
-                               *(destp + 5) ^= *(source2 + 5);
-                               *(destp + 5) ^= *(source3 + 5);
-                               *(destp + 6) ^= *(source1 + 6);
-                               *(destp + 6) ^= *(source2 + 6);
-                               *(destp + 6) ^= *(source3 + 6);
-                               *(destp + 7) ^= *(source1 + 7);
-                               *(destp + 7) ^= *(source2 + 7);
-                               *(destp + 7) ^= *(source3 + 7);
-                               source1 += 8;
-                               source2 += 8;
-                               source3 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 5:
-                       source4 = (long *) bh_ptr[4]->b_data;
-                       source3 = (long *) bh_ptr[3]->b_data;
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               *(destp + 0) ^= *(source1 + 0);
-                               *(destp + 0) ^= *(source2 + 0);
-                               *(destp + 0) ^= *(source3 + 0);
-                               *(destp + 0) ^= *(source4 + 0);
-                               *(destp + 1) ^= *(source1 + 1);
-                               *(destp + 1) ^= *(source2 + 1);
-                               *(destp + 1) ^= *(source3 + 1);
-                               *(destp + 1) ^= *(source4 + 1);
-                               *(destp + 2) ^= *(source1 + 2);
-                               *(destp + 2) ^= *(source2 + 2);
-                               *(destp + 2) ^= *(source3 + 2);
-                               *(destp + 2) ^= *(source4 + 2);
-                               *(destp + 3) ^= *(source1 + 3);
-                               *(destp + 3) ^= *(source2 + 3);
-                               *(destp + 3) ^= *(source3 + 3);
-                               *(destp + 3) ^= *(source4 + 3);
-                               *(destp + 4) ^= *(source1 + 4);
-                               *(destp + 4) ^= *(source2 + 4);
-                               *(destp + 4) ^= *(source3 + 4);
-                               *(destp + 4) ^= *(source4 + 4);
-                               *(destp + 5) ^= *(source1 + 5);
-                               *(destp + 5) ^= *(source2 + 5);
-                               *(destp + 5) ^= *(source3 + 5);
-                               *(destp + 5) ^= *(source4 + 5);
-                               *(destp + 6) ^= *(source1 + 6);
-                               *(destp + 6) ^= *(source2 + 6);
-                               *(destp + 6) ^= *(source3 + 6);
-                               *(destp + 6) ^= *(source4 + 6);
-                               *(destp + 7) ^= *(source1 + 7);
-                               *(destp + 7) ^= *(source2 + 7);
-                               *(destp + 7) ^= *(source3 + 7);
-                               *(destp + 7) ^= *(source4 + 7);
-                               source1 += 8;
-                               source2 += 8;
-                               source3 += 8;
-                               source4 += 8;
-                               destp += 8;
-                       }
-                       break;
-       }
-}
-
-/*
- * platform independent RAID5 checksum calculation; this should
- * be very fast on any platform that has a decent number of
- * registers (32 or more).
- */
-XORBLOCK_TEMPLATE(32regs)
-{
-       int size  = bh_ptr[0]->b_size;
-       int lines = size / (sizeof (long)) / 8, i;
-       long *destp   = (long *) bh_ptr[0]->b_data;
-       long *source1, *source2, *source3, *source4;
-       
-         /* LOTS of registers available...
-            We do explicit loop unrolling here for code which
-            favours RISC machines.  In fact this is almost direct
-            RISC assembly on Alpha and SPARC :-)  */
-
-
-       switch(count) {
-               case 2:
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               register long d0, d1, d2, d3, d4, d5, d6, d7;
-                               d0 = destp[0];  /* Pull the stuff into registers        */
-                               d1 = destp[1];  /*  ... in bursts, if possible.         */
-                               d2 = destp[2];
-                               d3 = destp[3];
-                               d4 = destp[4];
-                               d5 = destp[5];
-                               d6 = destp[6];
-                               d7 = destp[7];
-                               d0 ^= source1[0];
-                               d1 ^= source1[1];
-                               d2 ^= source1[2];
-                               d3 ^= source1[3];
-                               d4 ^= source1[4];
-                               d5 ^= source1[5];
-                               d6 ^= source1[6];
-                               d7 ^= source1[7];
-                               destp[0] = d0;  /* Store the result (in bursts)         */
-                               destp[1] = d1;
-                               destp[2] = d2;
-                               destp[3] = d3;
-                               destp[4] = d4;  /* Store the result (in bursts)         */
-                               destp[5] = d5;
-                               destp[6] = d6;
-                               destp[7] = d7;
-                               source1 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 3:
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               register long d0, d1, d2, d3, d4, d5, d6, d7;
-                               d0 = destp[0];  /* Pull the stuff into registers        */
-                               d1 = destp[1];  /*  ... in bursts, if possible.         */
-                               d2 = destp[2];
-                               d3 = destp[3];
-                               d4 = destp[4];
-                               d5 = destp[5];
-                               d6 = destp[6];
-                               d7 = destp[7];
-                               d0 ^= source1[0];
-                               d1 ^= source1[1];
-                               d2 ^= source1[2];
-                               d3 ^= source1[3];
-                               d4 ^= source1[4];
-                               d5 ^= source1[5];
-                               d6 ^= source1[6];
-                               d7 ^= source1[7];
-                               d0 ^= source2[0];
-                               d1 ^= source2[1];
-                               d2 ^= source2[2];
-                               d3 ^= source2[3];
-                               d4 ^= source2[4];
-                               d5 ^= source2[5];
-                               d6 ^= source2[6];
-                               d7 ^= source2[7];
-                               destp[0] = d0;  /* Store the result (in bursts)         */
-                               destp[1] = d1;
-                               destp[2] = d2;
-                               destp[3] = d3;
-                               destp[4] = d4;  /* Store the result (in bursts)         */
-                               destp[5] = d5;
-                               destp[6] = d6;
-                               destp[7] = d7;
-                               source1 += 8;
-                               source2 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 4:
-                       source3 = (long *) bh_ptr[3]->b_data;
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               register long d0, d1, d2, d3, d4, d5, d6, d7;
-                               d0 = destp[0];  /* Pull the stuff into registers        */
-                               d1 = destp[1];  /*  ... in bursts, if possible.         */
-                               d2 = destp[2];
-                               d3 = destp[3];
-                               d4 = destp[4];
-                               d5 = destp[5];
-                               d6 = destp[6];
-                               d7 = destp[7];
-                               d0 ^= source1[0];
-                               d1 ^= source1[1];
-                               d2 ^= source1[2];
-                               d3 ^= source1[3];
-                               d4 ^= source1[4];
-                               d5 ^= source1[5];
-                               d6 ^= source1[6];
-                               d7 ^= source1[7];
-                               d0 ^= source2[0];
-                               d1 ^= source2[1];
-                               d2 ^= source2[2];
-                               d3 ^= source2[3];
-                               d4 ^= source2[4];
-                               d5 ^= source2[5];
-                               d6 ^= source2[6];
-                               d7 ^= source2[7];
-                               d0 ^= source3[0];
-                               d1 ^= source3[1];
-                               d2 ^= source3[2];
-                               d3 ^= source3[3];
-                               d4 ^= source3[4];
-                               d5 ^= source3[5];
-                               d6 ^= source3[6];
-                               d7 ^= source3[7];
-                               destp[0] = d0;  /* Store the result (in bursts)         */
-                               destp[1] = d1;
-                               destp[2] = d2;
-                               destp[3] = d3;
-                               destp[4] = d4;  /* Store the result (in bursts)         */
-                               destp[5] = d5;
-                               destp[6] = d6;
-                               destp[7] = d7;
-                               source1 += 8;
-                               source2 += 8;
-                               source3 += 8;
-                               destp += 8;
-                       }
-                       break;
-               case 5:
-                       source4 = (long *) bh_ptr[4]->b_data;
-                       source3 = (long *) bh_ptr[3]->b_data;
-                       source2 = (long *) bh_ptr[2]->b_data;
-                       source1 = (long *) bh_ptr[1]->b_data;
-                       for (i = lines; i > 0; i--) {
-                               register long d0, d1, d2, d3, d4, d5, d6, d7;
-                               d0 = destp[0];  /* Pull the stuff into registers        */
-                               d1 = destp[1];  /*  ... in bursts, if possible.         */
-                               d2 = destp[2];
-                               d3 = destp[3];
-                               d4 = destp[4];
-                               d5 = destp[5];
-                               d6 = destp[6];
-                               d7 = destp[7];
-                               d0 ^= source1[0];
-                               d1 ^= source1[1];
-                               d2 ^= source1[2];
-                               d3 ^= source1[3];
-                               d4 ^= source1[4];
-                               d5 ^= source1[5];
-                               d6 ^= source1[6];
-                               d7 ^= source1[7];
-                               d0 ^= source2[0];
-                               d1 ^= source2[1];
-                               d2 ^= source2[2];
-                               d3 ^= source2[3];
-                               d4 ^= source2[4];
-                               d5 ^= source2[5];
-                               d6 ^= source2[6];
-                               d7 ^= source2[7];
-                               d0 ^= source3[0];
-                               d1 ^= source3[1];
-                               d2 ^= source3[2];
-                               d3 ^= source3[3];
-                               d4 ^= source3[4];
-                               d5 ^= source3[5];
-                               d6 ^= source3[6];
-                               d7 ^= source3[7];
-                               d0 ^= source4[0];
-                               d1 ^= source4[1];
-                               d2 ^= source4[2];
-                               d3 ^= source4[3];
-                               d4 ^= source4[4];
-                               d5 ^= source4[5];
-                               d6 ^= source4[6];
-                               d7 ^= source4[7];
-                               destp[0] = d0;  /* Store the result (in bursts)         */
-                               destp[1] = d1;
-                               destp[2] = d2;
-                               destp[3] = d3;
-                               destp[4] = d4;  /* Store the result (in bursts)         */
-                               destp[5] = d5;
-                               destp[6] = d6;
-                               destp[7] = d7;
-                               source1 += 8;
-                               source2 += 8;
-                               source3 += 8;
-                               source4 += 8;
-                               destp += 8;
-                       }
-                       break;
-       }
-}
-
-/*
- * (the -6*32 shift factor colors the cache)
- */
-#define SIZE (PAGE_SIZE-6*32)
-
-static void xor_speed ( struct xor_block_template * func, 
-       struct buffer_head *b1, struct buffer_head *b2)
-{
-       int speed;
-       unsigned long now;
-       int i, count, max;
-       struct buffer_head *bh_ptr[6];
-
-       func->next = xor_functions;
-       xor_functions = func;
-       bh_ptr[0] = b1;
-       bh_ptr[1] = b2;
-
-       /*
-        * count the number of XORs done during a whole jiffy.
-        * calculate the speed of checksumming from this.
-        * (we use a 2-page allocation to have guaranteed
-        * color L1-cache layout)
-        */
-       max = 0;
-       for (i = 0; i < 5; i++) {
-               now = jiffies;
-               count = 0;
-               while (jiffies == now) {
-                       mb();
-                       func->xor_block(2,bh_ptr);
-                       mb();
-                       count++;
-                       mb();
-               }
-               if (count > max)
-                       max = count;
-       }
-
-       speed = max * (HZ*SIZE/1024);
-       func->speed = speed;
-
-       printk( "   %-10s: %5d.%03d MB/sec\n", func->name,
-               speed / 1000, speed % 1000);
-}
-
-static inline void pick_fastest_function(void)
-{
-       struct xor_block_template *f, *fastest;
-
-       fastest = xor_functions;
-       for (f = fastest; f; f = f->next) {
-               if (f->speed > fastest->speed)
-                       fastest = f;
-       }
-#ifdef CONFIG_X86_XMM 
-       if (cpu_has_xmm) {
-               /* We force the use of the KNI xor block because it
-                       can write around L2.  We may also be able
-                       to load into the L1 only, depending on how
-                       the CPU deals with a load to a line that is
-                       being prefetched.
-               */
-               fastest = &t_xor_block_pIII_kni;
-       }
-#endif
-#ifdef __alpha__
-       if (implver() == IMPLVER_EV6) {
-               /* Force the use of alpha_prefetch if EV6, as it
-                  is significantly faster in the cold cache case.  */
-               fastest = &t_xor_block_alpha_prefetch;
-       }
-#endif
-       xor_block = fastest->xor_block;
-       printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
-               fastest->speed / 1000, fastest->speed % 1000);
-}
-static struct buffer_head b1, b2;
-
-void calibrate_xor_block(void)
-{
-       if (xor_block)
-               return;
-       memset(&b1,0,sizeof(b1));
-       b2 = b1;
-
-       b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2);
-       if (!b1.b_data) {
-               pick_fastest_function();
-               return;
-       }
-       b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE;
-
-       b1.b_size = SIZE;
-
-       printk(KERN_INFO "raid5: measuring checksumming speed\n");
-
-       sti(); /* should be safe */
-
-#if defined(__sparc__) && !defined(__sparc_v9__)
-       printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n");
-       xor_speed(&t_xor_block_SPARC,&b1,&b2);
-#endif
-
-#ifdef CONFIG_X86_XMM 
-       if (cpu_has_xmm) {
-               printk(KERN_INFO
-                       "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n");
-               xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
-       }
-#endif /* CONFIG_X86_XMM */
-
-#ifdef __i386__
-       if (md_cpu_has_mmx()) {
-               printk(KERN_INFO
-                       "raid5: MMX detected, trying high-speed MMX checksum routines\n");
-               xor_speed(&t_xor_block_pII_mmx,&b1,&b2);
-               xor_speed(&t_xor_block_p5_mmx,&b1,&b2);
-       }
-#endif /* __i386__ */
-
-#ifdef __alpha__
-       xor_speed(&t_xor_block_alpha,&b1,&b2);
-       xor_speed(&t_xor_block_alpha_prefetch,&b1,&b2);
-#endif
-       
-       xor_speed(&t_xor_block_8regs,&b1,&b2);
-       xor_speed(&t_xor_block_32regs,&b1,&b2);
-
-       free_pages((unsigned long)b1.b_data,2);
-       pick_fastest_function();
-}
-
-#else /* __sparc_v9__ */
-
-void calibrate_xor_block(void)
-{
-       if (xor_block)
-               return;
-       printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n");
-       xor_block = xor_block_VIS;
-}
-
-#endif /* __sparc_v9__ */
-
-MD_EXPORT_SYMBOL(xor_block);
-MD_EXPORT_SYMBOL(calibrate_xor_block);
-
-#ifdef MODULE
-int init_module(void)
-{
-       calibrate_xor_block();
-       return 0;
-}
-#endif
+/*
+ * xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
+ *
+ * Dispatch optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/raid/md.h>
+#include <linux/raid/xor.h>
+#include <asm/xor.h>
+
+/* The xor routines to use.  */
+static struct xor_block_template *active_template;
+
+void
+xor_block(unsigned int count, struct buffer_head **bh_ptr)
+{
+       unsigned long *p0, *p1, *p2, *p3, *p4;
+       unsigned long bytes = bh_ptr[0]->b_size;
+
+       p0 = (unsigned long *) bh_ptr[0]->b_data;
+       p1 = (unsigned long *) bh_ptr[1]->b_data;
+       if (count == 2) {
+               active_template->do_2(bytes, p0, p1);
+               return;
+       }
+
+       p2 = (unsigned long *) bh_ptr[2]->b_data;
+       if (count == 3) {
+               active_template->do_3(bytes, p0, p1, p2);
+               return;
+       }
+
+       p3 = (unsigned long *) bh_ptr[3]->b_data;
+       if (count == 4) {
+               active_template->do_4(bytes, p0, p1, p2, p3);
+               return;
+       }
+
+       p4 = (unsigned long *) bh_ptr[4]->b_data;
+       active_template->do_5(bytes, p0, p1, p2, p3, p4);
+}
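
For orientation, a minimal hypothetical call site for this dispatcher: the RAID-5 core hands it an array of buffer heads with slot 0 doubling as the destination.  Only the xor_block() signature above is from the patch; the array setup here is illustrative.

        /* Hypothetical caller: dest ^= src1 ^ src2 ^ src3.
           bh[0] is both destination and first operand, so count == 4. */
        struct buffer_head *bh[4];
        /* ... fill in bh[0..3]->b_data and bh[0]->b_size ... */
        xor_block(4, bh);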
+
+/* Set of all registered templates.  */
+static struct xor_block_template *template_list;
+
+/* The -6*32 shift factor colors the cache.  */
+#define BENCH_SIZE (PAGE_SIZE-6*32)
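
For concreteness (a worked example, not text from the patch): with 4 KB pages, BENCH_SIZE = 4096 - 6*32 = 3904 bytes.  The benchmark's second buffer is placed at b1 + 2*PAGE_SIZE + BENCH_SIZE = b1 + 12096, which is not a multiple of the page size, so the two buffers start at different offsets within a page and land in different cache colors rather than evicting each other line-for-line.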
+
+static void
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+{
+       int speed;
+       unsigned long now;
+       int i, count, max;
+
+       tmpl->next = template_list;
+       template_list = tmpl;
+
+       /*
+        * Count the number of XORs done during a whole jiffy, and use
+        * this to calculate the speed of checksumming.  We use a 2-page
+        * allocation to have guaranteed color L1-cache layout.
+        */
+       max = 0;
+       for (i = 0; i < 5; i++) {
+               now = jiffies;
+               count = 0;
+               while (jiffies == now) {
+                       mb();
+                       tmpl->do_2(BENCH_SIZE, b1, b2);
+                       mb();
+                       count++;
+                       mb();
+               }
+               if (count > max)
+                       max = count;
+       }
+
+       speed = max * (HZ * BENCH_SIZE / 1024);
+       tmpl->speed = speed;
+
+       printk("   %-10s: %5d.%03d MB/sec\n", tmpl->name,
+              speed / 1000, speed % 1000);
+}
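
To check the units of the speed figure: each do_2 call XORs BENCH_SIZE bytes, so max calls per jiffy times HZ gives bytes per second, and dividing by 1024 yields KB/sec; printing speed/1000 then reads as (approximately) MB/sec.  A worked example with assumed values: HZ = 100 and 4 KB pages give HZ * BENCH_SIZE / 1024 = 100 * 3904 / 1024 = 381 (integer division), so max = 3000 loops per jiffy prints as "1143.000 MB/sec".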
+
+static int
+calibrate_xor_block(void)
+{
+       void *b1, *b2;
+       struct xor_block_template *f, *fastest;
+
+       b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
+       if (!b1) {
+               printk("raid5: Yikes!  No memory available.\n");
+               return -ENOMEM;
+       }
+       b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+
+       printk(KERN_INFO "raid5: measuring checksumming speed\n");
+       sti();
+
+#define xor_speed(templ)       do_xor_speed((templ), b1, b2)
+
+       XOR_TRY_TEMPLATES;
+
+#undef xor_speed
+
+       free_pages((unsigned long)b1, 2);
+
+       fastest = template_list;
+       for (f = fastest; f; f = f->next)
+               if (f->speed > fastest->speed)
+                       fastest = f;
+
+#ifdef XOR_SELECT_TEMPLATE
+       fastest = XOR_SELECT_TEMPLATE(fastest);
+#endif
+
+       active_template = fastest;
+       printk("raid5: using function: %s (%d.%03d MB/sec)\n",
+              fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+
+       return 0;
+}
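
The templates themselves now live in the new per-architecture <asm/xor.h> headers (plus include/linux/raid/xor.h).  A sketch of the shape those headers are expected to provide, reconstructed from the uses above; the declarations and template names here are illustrative, not copied from any one header:

        struct xor_block_template {
                struct xor_block_template *next; /* chained onto template_list */
                char *name;                      /* printed by do_xor_speed()  */
                int speed;                       /* filled in by do_xor_speed() */
                void (*do_2)(unsigned long, unsigned long *, unsigned long *);
                void (*do_3)(unsigned long, unsigned long *, unsigned long *,
                             unsigned long *);
                void (*do_4)(unsigned long, unsigned long *, unsigned long *,
                             unsigned long *, unsigned long *);
                void (*do_5)(unsigned long, unsigned long *, unsigned long *,
                             unsigned long *, unsigned long *, unsigned long *);
        };

        /* Each arch header benchmarks whichever templates apply to it ... */
        #define XOR_TRY_TEMPLATES                       \
                do {                                    \
                        xor_speed(&xor_block_8regs);    \
                        xor_speed(&xor_block_32regs);   \
                } while (0)

        /* ... and may veto the benchmark winner, as the old code did for
           EV6 Alphas and KNI-capable x86 chips. */
        #define XOR_SELECT_TEMPLATE(FASTEST)    (FASTEST)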
+
+MD_EXPORT_SYMBOL(xor_block);
+
+module_init(calibrate_xor_block);
index ba79f94..245442f 100644 (file)
@@ -1129,7+1129,7 @@ static int nsc_ircc_hard_xmit_fir(struct sk_buff *skb, struct net_device *dev)
        if ((speed = irda_get_speed(skb)) != self->io.speed) {
                /* Check for empty frame */
                if (!skb->len) {
-                       nsc_ircc_change_speed_complete(self, speed); 
+                       nsc_ircc_change_speed(self, speed); 
                        return 0;
                } else
                        self->new_speed = speed;
index c9e6c1d..5c657b2 100644 (file)
@@ -207,8+207,10 @@ int __init a2091_detect(Scsi_Host_Template *tpnt)
            continue;
 
        instance = scsi_register (tpnt, sizeof (struct WD33C93_hostdata));
-       if(instance == NULL)
-               continue;
+       if (instance == NULL) {
+           release_mem_region(address, 256);
+           continue;
+       }
        instance->base = ZTWO_VADDR(address);
        instance->irq = IRQ_AMIGA_PORTS;
        instance->unique_id = z->slotaddr;
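
The fix makes the probe loop undo its earlier resource claim before skipping a board.  In outline (an illustrative sketch of the pattern, not the driver's full code, and assuming the region was claimed with request_mem_region() earlier in the loop):

        if (!request_mem_region(address, 256, "wd33c93"))      /* claimed earlier */
                continue;
        instance = scsi_register(tpnt, sizeof(struct WD33C93_hostdata));
        if (instance == NULL) {
                release_mem_region(address, 256);       /* undo before next board */
                continue;
        }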
index 4e8d0c4..a509f6c 100644 (file)
@@ -66,8+66,8 @@ static __inline__ long atomic_add_return(int i, atomic_t * v)
        long temp, result;
        __asm__ __volatile__(
        "1:     ldl_l %0,%1\n"
+       "       addl %0,%3,%2\n"
        "       addl %0,%3,%0\n"
-       "       mov %0,%2\n"
        "       stl_c %0,%1\n"
        "       beq %0,2f\n"
        "       mb\n"
@@ -84,8+84,8 @@ static __inline__ long atomic_sub_return(int i, atomic_t * v)
        long temp, result;
        __asm__ __volatile__(
        "1:     ldl_l %0,%1\n"
+       "       subl %0,%3,%2\n"
        "       subl %0,%3,%0\n"
-       "       mov %0,%2\n"
        "       stl_c %0,%1\n"
        "       beq %0,2f\n"
        "       mb\n"
index 70d6ce7..7714bf2 100644 (file)
   __asm__("stw %1,%0" : "=m"(mem) : "r"(val))
 #endif
 
+/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
+   a mechanism by which the user can annotate likely branch directions and
+   expect the blocks to be reordered appropriately.  Define __builtin_expect
+   to nothing for earlier compilers.  */
+
+#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
+#define __builtin_expect(x, expected_value) (x)
+#endif
+
 #endif /* __ALPHA_COMPILER_H */
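
The point of the fallback is that callers can annotate branches unconditionally; on older compilers the hint simply evaporates.  A generic illustration (not a line from this patch, though the new Alpha semaphore fast paths below use the same pattern):

        /* Hint that the failure branch is rare, so GCC 2.96+ lays out the
           straight-line code for the common case.  With the fallback above,
           GCC < 2.96 compiles this as a plain test. */
        static int check_alloc(void *p)
        {
                if (__builtin_expect(p == NULL, 0))
                        return -1;      /* unlikely error path */
                return 0;
        }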
diff --git a/include/asm-alpha/semaphore-helper.h b/include/asm-alpha/semaphore-helper.h
deleted file mode 100644 (file)
index 52d8fb5..0000000
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef _ALPHA_SEMAPHORE_HELPER_H
-#define _ALPHA_SEMAPHORE_HELPER_H
-
-/*
- * SMP- and interrupt-safe semaphores helper functions.
- *
- * (C) Copyright 1996 Linus Torvalds
- * (C) Copyright 1999 Richard Henderson
- */
-
-/*
- * These two _must_ execute atomically wrt each other.
- *
- * This is trivially done with load_locked/store_cond,
- * which we have.  Let the rest of the losers suck eggs.
- */
-
-static inline void
-wake_one_more(struct semaphore * sem)
-{
-       atomic_inc(&sem->waking);
-}
-
-static inline int
-waking_non_zero(struct semaphore *sem)
-{
-       long ret, tmp;
-
-       /* An atomic conditional decrement.  */
-       __asm__ __volatile__(
-               "1:     ldl_l   %1,%2\n"
-               "       blt     %1,2f\n"
-               "       subl    %1,1,%0\n"
-               "       stl_c   %0,%2\n"
-               "       beq     %0,3f\n"
-               "2:\n"
-               ".subsection 2\n"
-               "3:     br      1b\n"
-               ".previous"
-               : "=r"(ret), "=r"(tmp), "=m"(sem->waking.counter)
-               : "0"(0));
-
-       return ret > 0;
-}
-
-
-/*
- * waking_non_zero_interruptible:
- *     1       got the lock
- *     0       go to sleep
- *     -EINTR  interrupted
- *
- * We must undo the sem->count down_interruptible decrement
- * simultaneously and atomically with the sem->waking adjustment,
- * otherwise we can race with wake_one_more.
- *
- * This is accomplished by doing a 64-bit ll/sc on the 2 32-bit words.
- */
-
-static inline int
-waking_non_zero_interruptible(struct semaphore *sem, struct task_struct *tsk)
-{
-       long ret, tmp, tmp2, tmp3;
-
-       /* "Equivalent" C.  Note that we have to do this all without
-          (taken) branches in order to be a valid ll/sc sequence.
-
-          do {
-              tmp = ldq_l;
-              ret = 0;
-              if (tmp >= 0) {
-                  tmp += 0xffffffff00000000;
-                  ret = 1;
-              }
-              else if (pending) {
-                  // Since -1 + 1 carries into the high word, we have
-                  // to be more careful adding 1 here.
-                  tmp = (tmp & 0xffffffff00000000)
-                        | ((tmp + 1) & 0x00000000ffffffff);
-                  ret = -EINTR;
-              }
-              else {
-                  break;       // ideally.  we don't actually break,
-                               // since breaking needs a predicate we
-                               // don't have, and building one is more
-                               // trouble than eliding the no-op stq_c.
-              }
-              tmp = stq_c = tmp;
-          } while (tmp == 0);
-       */
-
-       __asm__ __volatile__(
-               "1:     ldq_l   %1,%4\n"
-               "       lda     %0,0\n"
-               "       cmovne  %5,%6,%0\n"
-               "       addq    %1,1,%2\n"
-               "       and     %1,%7,%3\n"
-               "       andnot  %2,%7,%2\n"
-               "       cmovge  %1,1,%0\n"
-               "       or      %3,%2,%2\n"
-               "       addq    %1,%7,%3\n"
-               "       cmovne  %5,%2,%1\n"
-               "       cmovge  %2,%3,%1\n"
-               "       stq_c   %1,%4\n"
-               "       beq     %1,3f\n"
-               "2:\n"
-               ".subsection 2\n"
-               "3:     br      1b\n"
-               ".previous"
-               : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3), "=m"(*sem)
-               : "r"(signal_pending(tsk)), "r"(-EINTR),
-                 "r"(0xffffffff00000000));
-
-       return ret;
-}
-
-/*
- * waking_non_zero_trylock is unused.  we do everything in 
- * down_trylock and let non-ll/sc hosts bounce around.
- */
-
-static inline int
-waking_non_zero_trylock(struct semaphore *sem)
-{
-       return 0;
-}
-
-#endif
dissimilarity index 65%
index 38bc05c..1aedb34 100644 (file)
-#ifndef _ALPHA_SEMAPHORE_H
-#define _ALPHA_SEMAPHORE_H
-
-/*
- * SMP- and interrupt-safe semaphores.
- *
- * (C) Copyright 1996 Linus Torvalds
- * (C) Copyright 1996, 2000 Richard Henderson
- */
-
-#include <asm/current.h>
-#include <asm/system.h>
-#include <asm/atomic.h>
-
-struct semaphore {
-       /* Careful, inline assembly knows about the position of these two.  */
-       atomic_t count;
-       atomic_t waking;                /* biased by -1 */
-       wait_queue_head_t wait;
-#if WAITQUEUE_DEBUG
-       long __magic;
-#endif
-};
-
-#if WAITQUEUE_DEBUG
-# define __SEM_DEBUG_INIT(name)                , (long)&(name).__magic
-#else
-# define __SEM_DEBUG_INIT(name)
-#endif
-
-#define __SEMAPHORE_INITIALIZER(name,count)            \
-       { ATOMIC_INIT(count), ATOMIC_INIT(-1),          \
-         __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)    \
-         __SEM_DEBUG_INIT(name) }
-
-#define __MUTEX_INITIALIZER(name) \
-       __SEMAPHORE_INITIALIZER(name,1)
-
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
-       struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
-
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
-#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
-
-extern inline void sema_init(struct semaphore *sem, int val)
-{
-       /*
-        * Logically, 
-        *   *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
-        * except that gcc still produces better code initializing it by parts.
-        */
-
-       atomic_set(&sem->count, val);
-       atomic_set(&sem->waking, -1);
-       init_waitqueue_head(&sem->wait);
-#if WAITQUEUE_DEBUG
-       sem->__magic = (long)&sem->__magic;
-#endif
-}
-
-static inline void init_MUTEX (struct semaphore *sem)
-{
-       sema_init(sem, 1);
-}
-
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
-{
-       sema_init(sem, 0);
-}
-
-
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern int  __down_trylock(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
-
-/* All have custom assembly linkages.  */
-extern void __down_failed(struct semaphore * sem);
-extern void __down_failed_interruptible(struct semaphore * sem);
-extern void __down_failed_trylock(struct semaphore * sem);
-extern void __up_wakeup(struct semaphore * sem);
-
-/*
- * Whee.  Hidden out of line code is fun.  The contention cases are
- * handled out of line in kernel/sched.c; arch/alpha/lib/semaphore.S
- * takes care of making sure we can call it without clobbering regs.
- */
-
-extern inline void down(struct semaphore * sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __down_failed anyway, reuse them in 
-          the atomic operation as well. 
-
-          __down_failed takes the semaphore address in $24, and
-          its return address in $28.  The pv is loaded as usual.
-          The gp is clobbered (in the module case) as usual.  */
-
-       /* This little bit of silliness is to get the GP loaded for
-          a function that ordinarily wouldn't.  Otherwise we could
-          have it done by the macro directly, which can be optimized
-          by the linker.  */
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-       
-       pv = __down_failed;
-       __asm__ __volatile__ (
-               "/* semaphore down operation */\n"
-               "1:     ldl_l   $24,%1\n"
-               "       subl    $24,1,$28\n"
-               "       subl    $24,1,$24\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       blt     $24,3f\n"
-               "4:     mb\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%1\n"
-               "       jsr     $28,($27),__down_failed\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv)
-               : "$24", "$28", "memory");
-}
-
-extern inline int down_interruptible(struct semaphore * sem)
-{
-       /* __down_failed_interruptible takes the semaphore address in $24,
-          and its return address in $28.  The pv is loaded as usual.
-          The gp is clobbered (in the module case) as usual.  The return
-          value is in $24.  */
-
-       register int ret __asm__("$24");
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-       
-       pv = __down_failed_interruptible;
-       __asm__ __volatile__ (
-               "/* semaphore down interruptible operation */\n"
-               "1:     ldl_l   $24,%2\n"
-               "       subl    $24,1,$28\n"
-               "       subl    $24,1,$24\n"
-               "       stl_c   $28,%2\n"
-               "       beq     $28,2f\n"
-               "       blt     $24,3f\n"
-               "       mov     $31,%0\n"
-               "4:     mb\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%2\n"
-               "       jsr     $28,($27),__down_failed_interruptible\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(ret), "=r"(pv)
-               : "m"(sem->count), "r"(pv)
-               : "$28", "memory");
-
-       return ret;
-}
-
-/*
- * down_trylock returns 0 on success, 1 if we failed to get the lock.
- *
- * We must manipulate count and waking simultaneously and atomically.
- * Do this by using ll/sc on the pair of 32-bit words.
- */
-
-extern inline int down_trylock(struct semaphore * sem)
-{
-       long ret, tmp, tmp2, sub;
-
-       /* "Equivalent" C.  Note that we have to do this all without
-          (taken) branches in order to be a valid ll/sc sequence.
-
-          do {
-              tmp = ldq_l;
-              sub = 0x0000000100000000;
-              ret = ((int)tmp <= 0);           // count <= 0 ?
-              if ((int)tmp >= 0) sub = 0;      // count >= 0 ?
-                       // note that if count=0 the subq overflows to the
-                       // high longword (i.e. waking)
-              ret &= ((long)tmp < 0);          // waking < 0 ?
-              sub += 1;
-              if (ret) 
-                       break;  
-              tmp -= sub;
-              tmp = stq_c = tmp;
-          } while (tmp == 0);
-       */
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-       
-       __asm__ __volatile__(
-               "1:     ldq_l   %1,%4\n"
-               "       lda     %3,1\n"
-               "       addl    %1,0,%2\n"
-               "       sll     %3,32,%3\n"
-               "       cmple   %2,0,%0\n"
-               "       cmovge  %2,0,%3\n"
-               "       cmplt   %1,0,%2\n"
-               "       addq    %3,1,%3\n"
-               "       and     %0,%2,%0\n"
-               "       bne     %0,2f\n"
-               "       subq    %1,%3,%1\n"
-               "       stq_c   %1,%4\n"
-               "       beq     %1,3f\n"
-               "2:\n"
-               ".subsection 2\n"
-               "3:     br      1b\n"
-               ".previous"
-               : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(sub)
-               : "m"(*sem)
-               : "memory");
-
-       return ret;
-}
-
-extern inline void up(struct semaphore * sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __up_wakeup anyway, reuse them in 
-          the atomic operation as well. 
-
-          __up_wakeup takes the semaphore address in $24, and
-          its return address in $28.  The pv is loaded as usual.
-          The gp is clobbered (in the module case) as usual.  */
-
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-       
-       pv = __up_wakeup;
-       __asm__ __volatile__ (
-               "/* semaphore up operation */\n"
-               "       mb\n"
-               "1:     ldl_l   $24,%1\n"
-               "       addl    $24,1,$28\n"
-               "       addl    $24,1,$24\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       ble     $24,3f\n"
-               "4:\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%1\n"
-               "       jsr     $28,($27),__up_wakeup\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv)
-               : "$24", "$28", "memory");
-}
-
-
-/* rw mutexes (should that be mutices? =) -- throw rw
- * spinlocks and semaphores together, and this is what we
- * end up with...
- *
- * The lock is initialized to BIAS.  This way, a writer
- * subtracts BIAS and gets 0 for the case of an uncontended
- * lock.  Readers decrement by 1 and see a positive value
- * when uncontended, negative if there are writers waiting
- * (in which case it goes to sleep).
- *
- * The value 0x01000000 supports up to 128 processors and
- * lots of processes.  BIAS must be chosen such that subtracting
- * BIAS once per CPU will result in the int remaining
- * negative.
- * In terms of fairness, this should result in the lock
- * flopping back and forth between readers and writers
- * under heavy use.
- *
- *           -ben
- *
- * Once we start supporting machines with more than 128 CPUs,
- * we should go for using a 64-bit atomic type instead of 32-bit
- * as the counter.  We shall probably go for bias 0x80000000 then,
- * so that a single sethi can set it.
- *
- *           -jj
- */
-
-#define RW_LOCK_BIAS           0x01000000
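
Worked through with that value (illustrative numbers, not from the source): an idle lock holds 0x01000000.  A writer subtracts the full bias and sees 0x01000000 - 0x01000000 = 0, i.e. uncontended.  A reader subtracts 1 and sees 0x00FFFFFF, still positive, so it proceeds; while a writer holds the lock (count at or below 0), a reader's decrement yields a negative value and the reader goes to sleep.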
-
-struct rw_semaphore {
-       int                     count;
-       /* bit 0 means read bias granted;
-          bit 1 means write bias granted.  */
-       unsigned                granted;
-       wait_queue_head_t       wait;
-       wait_queue_head_t       write_bias_wait;
-#if WAITQUEUE_DEBUG
-       long                    __magic;
-       atomic_t                readers;
-       atomic_t                writers;
-#endif
-};
-
-#if WAITQUEUE_DEBUG
-#define __RWSEM_DEBUG_INIT     , ATOMIC_INIT(0), ATOMIC_INIT(0)
-#else
-#define __RWSEM_DEBUG_INIT     /* */
-#endif
-
-#define __RWSEM_INITIALIZER(name,count)                                        \
-       { (count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait),       \
-         __WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait)         \
-         __SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
-
-#define __DECLARE_RWSEM_GENERIC(name,count) \
-       struct rw_semaphore name = __RWSEM_INITIALIZER(name,count)
-
-#define DECLARE_RWSEM(name) \
-       __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS)
-#define DECLARE_RWSEM_READ_LOCKED(name) \
-       __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS-1)
-#define DECLARE_RWSEM_WRITE_LOCKED(name) \
-       __DECLARE_RWSEM_GENERIC(name, 0)
-
-extern inline void init_rwsem(struct rw_semaphore *sem)
-{
-       sem->count = RW_LOCK_BIAS;
-       sem->granted = 0;
-       init_waitqueue_head(&sem->wait);
-       init_waitqueue_head(&sem->write_bias_wait);
-#if WAITQUEUE_DEBUG
-       sem->__magic = (long)&sem->__magic;
-       atomic_set(&sem->readers, 0);
-       atomic_set(&sem->writers, 0);
-#endif
-}
-
-/* All have custom assembly linkages.  */
-extern void __down_read_failed(struct rw_semaphore *sem);
-extern void __down_write_failed(struct rw_semaphore *sem);
-extern void __rwsem_wake(struct rw_semaphore *sem, unsigned long readers);
-
-extern inline void down_read(struct rw_semaphore *sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __down_read_failed anyway, reuse them in 
-          the atomic operation as well. 
-
-          __down_read_failed takes the semaphore address in $24, the count
-          we read in $25, and its return address in $28. The pv is loaded
-          as usual. The gp is clobbered (in the module case) as usual.  */
-
-       /* This little bit of silliness is to get the GP loaded for
-          a function that ordinarily wouldn't.  Otherwise we could
-          have it done by the macro directly, which can be optimized
-          by the linker.  */
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-
-       pv = __down_read_failed;
-       __asm__ __volatile__(
-               "/* semaphore down_read operation */\n"
-               "1:     ldl_l   $24,%1\n"
-               "       subl    $24,1,$28\n"
-               "       subl    $24,1,$25\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       blt     $25,3f\n"
-               "4:     mb\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%1\n"
-               "       jsr     $28,($27),__down_read_failed\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv)
-               : "$24", "$25", "$28", "memory");
-
-#if WAITQUEUE_DEBUG
-       if (sem->granted & 2)
-               BUG();
-       if (atomic_read(&sem->writers))
-               BUG();
-       atomic_inc(&sem->readers);
-#endif
-}
-
-extern inline void down_write(struct rw_semaphore *sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __down_write_failed anyway, reuse them in 
-          the atomic operation as well. 
-
-          __down_write_failed takes the semaphore address in $24, the count
-          we read in $25, and its return address in $28. The pv is loaded
-          as usual. The gp is clobbered (in the module case) as usual.  */
-
-       /* This little bit of silliness is to get the GP loaded for
-          a function that ordinarily wouldn't.  Otherwise we could
-          have it done by the macro directly, which can be optimized
-          by the linker.  */
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-#endif
-
-       pv = __down_write_failed;
-       __asm__ __volatile__(
-               "/* semaphore down_write operation */\n"
-               "1:     ldl_l   $24,%1\n"
-               "       ldah    $28,%3($24)\n"
-               "       ldah    $25,%3($24)\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       bne     $25,3f\n"
-               "4:     mb\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%1\n"
-               "       jsr     $28,($27),__down_write_failed\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv), "i"(-(RW_LOCK_BIAS >> 16))
-               : "$24", "$25", "$28", "memory");
-
-#if WAITQUEUE_DEBUG
-       if (atomic_read(&sem->writers))
-               BUG();
-       if (atomic_read(&sem->readers))
-               BUG();
-       if (sem->granted & 3)
-               BUG();
-       atomic_inc(&sem->writers);
-#endif
-}
-
-/* When a reader does a release, the only significant case is when
-   there was a writer waiting, and we've bumped the count to 0: we
-   must wake the writer up.  */
-
-extern inline void up_read(struct rw_semaphore *sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __rwsem_wake anyway, reuse them in 
-          the atomic operation as well. 
-
-          __rwsem_wake takes the semaphore address in $24, the
-          number of waiting readers in $25, and its return address
-          in $28.  The pv is loaded as usual. The gp is clobbered
-          (in the module case) as usual.  */
-
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-       if (sem->granted & 2)
-               BUG();
-       if (atomic_read(&sem->writers))
-               BUG();
-       atomic_dec(&sem->readers);
-#endif
-
-       pv = __rwsem_wake;
-       __asm__ __volatile__(
-               "/* semaphore up_read operation */\n"
-               "       mb\n"
-               "1:     ldl_l   $24,%1\n"
-               "       addl    $24,1,$28\n"
-               "       addl    $24,1,$24\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       beq     $24,3f\n"
-               "4:\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     lda     $24,%1\n"
-               "       mov     0,$25\n"
-               "       jsr     $28,($27),__rwsem_wake\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv)
-               : "$24", "$25", "$28", "memory");
-}
-
-/* releasing the writer is easy -- just release it and
- * wake up any sleepers.
- */
-extern inline void up_write(struct rw_semaphore *sem)
-{
-       /* Given that we have to use particular hard registers to 
-          communicate with __rwsem_wake anyway, reuse them in 
-          the atomic operation as well. 
-
-          __rwsem_wake takes the semaphore address in $24, the
-          number of waiting readers in $25, and its return address
-          in $28.  The pv is loaded as usual. The gp is clobbered
-          (in the module case) as usual.  */
-
-       register void *pv __asm__("$27");
-
-#if WAITQUEUE_DEBUG
-       CHECK_MAGIC(sem->__magic);
-       if (sem->granted & 3)
-               BUG();
-       if (atomic_read(&sem->readers))
-               BUG();
-       if (atomic_read(&sem->writers) != 1)
-               BUG();
-       atomic_dec(&sem->writers);
-#endif
-
-       pv = __rwsem_wake;
-       __asm__ __volatile__(
-               "/* semaphore up_write operation */\n"
-               "       mb\n"
-               "1:     ldl_l   $24,%1\n"
-               "       ldah    $28,%3($24)\n"
-               "       stl_c   $28,%1\n"
-               "       beq     $28,2f\n"
-               "       blt     $24,3f\n"
-               "4:\n"
-               ".subsection 2\n"
-               "2:     br      1b\n"
-               "3:     ldah    $25,%3($24)\n"
-               /* Only do the wake if we're no longer negative.  */
-               "       blt     $25,4b\n"
-               "       lda     $24,%1\n"
-               "       jsr     $28,($27),__rwsem_wake\n"
-               "       ldgp    $29,0($28)\n"
-               "       br      4b\n"
-               ".previous"
-               : "=r"(pv)
-               : "m"(sem->count), "r"(pv), "i"(RW_LOCK_BIAS >> 16)
-               : "$24", "$25", "$28", "memory");
-}
-
-#endif
+#ifndef _ALPHA_SEMAPHORE_H
+#define _ALPHA_SEMAPHORE_H
+
+/*
+ * SMP- and interrupt-safe semaphores.
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ * (C) Copyright 1996, 2000 Richard Henderson
+ */
+
+#include <asm/current.h>
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+#define DEBUG_SEMAPHORE 0
+#define DEBUG_RW_SEMAPHORE 0
+
+struct semaphore {
+       /* Careful, inline assembly knows about the position of these two.  */
+       atomic_t count __attribute__((aligned(8)));
+       atomic_t waking;                /* biased by -1 */
+
+       wait_queue_head_t wait;
+#if WAITQUEUE_DEBUG
+       long __magic;
+#endif
+};
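
That warning is load-bearing: the Alpha __down_trylock and __up below cover count and waking with a single 64-bit ldq_l/stq_c, which only works if the pair shares one aligned quadword, count in the low longword (Alpha is little-endian) and waking in the high one.  One hypothetical way to pin the assumption down at compile time (not part of the patch):

        #include <stddef.h>

        /* Fails to compile if the layout the inline assembly relies on
           ever changes: count at offset 0, waking at offset 4. */
        typedef char sem_layout_check[(offsetof(struct semaphore, count) == 0 &&
                                       offsetof(struct semaphore, waking) == 4)
                                      ? 1 : -1];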
+
+#if WAITQUEUE_DEBUG
+# define __SEM_DEBUG_INIT(name)                , (long)&(name).__magic
+#else
+# define __SEM_DEBUG_INIT(name)
+#endif
+
+#define __SEMAPHORE_INITIALIZER(name,count)            \
+       { ATOMIC_INIT(count), ATOMIC_INIT(-1),          \
+         __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)    \
+         __SEM_DEBUG_INIT(name) }
+
+#define __MUTEX_INITIALIZER(name) \
+       __SEMAPHORE_INITIALIZER(name,1)
+
+#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
+       struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+
+#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
+#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
+
+static inline void sema_init(struct semaphore *sem, int val)
+{
+       /*
+        * Logically, 
+        *   *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+        * except that gcc still produces better code initializing it by parts.
+        */
+
+       atomic_set(&sem->count, val);
+       atomic_set(&sem->waking, -1);
+       init_waitqueue_head(&sem->wait);
+#if WAITQUEUE_DEBUG
+       sem->__magic = (long)&sem->__magic;
+#endif
+}
+
+static inline void init_MUTEX (struct semaphore *sem)
+{
+       sema_init(sem, 1);
+}
+
+static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+{
+       sema_init(sem, 0);
+}
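
Call sites are unaffected by the reorganization; a minimal hypothetical user of this interface, for orientation:

        static DECLARE_MUTEX(demo_sem);         /* count starts at 1 */

        static int demo_op(void)
        {
                if (down_interruptible(&demo_sem))
                        return -EINTR;          /* woken by a signal */
                /* ... critical section ... */
                up(&demo_sem);
                return 0;
        }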
+
+extern void down(struct semaphore *);
+extern void __down_failed(struct semaphore *);
+extern int  down_interruptible(struct semaphore *);
+extern int  __down_failed_interruptible(struct semaphore *);
+extern int  down_trylock(struct semaphore *);
+extern void up(struct semaphore *);
+extern void __up_wakeup(struct semaphore *);
+
+/*
+ * Hidden out of line code is fun, but extremely messy.  Rely on newer
+ * compilers to do a respectable job with this.  The contention cases
+ * are handled out of line in arch/alpha/kernel/semaphore.c.
+ */
+
+static inline void __down(struct semaphore *sem)
+{
+       long count = atomic_dec_return(&sem->count);
+       if (__builtin_expect(count < 0, 0))
+               __down_failed(sem);
+}
+
+static inline int __down_interruptible(struct semaphore *sem)
+{
+       long count = atomic_dec_return(&sem->count);
+       if (__builtin_expect(count < 0, 0))
+               return __down_failed_interruptible(sem);
+       return 0;
+}
+
+/*
+ * down_trylock returns 0 on success, 1 if we failed to get the lock.
+ *
+ * We must manipulate count and waking simultaneously and atomically.
+ * Do this by using ll/sc on the pair of 32-bit words.
+ */
+
+static inline int __down_trylock(struct semaphore * sem)
+{
+       long ret, tmp, tmp2, sub;
+
+       /* "Equivalent" C.  Note that we have to do this all without
+          (taken) branches in order to be a valid ll/sc sequence.
+
+          do {
+               tmp = ldq_l;
+               sub = 0x0000000100000000;       
+               ret = ((int)tmp <= 0);          // count <= 0 ?
+               // Note that if count=0, the decrement overflows into
+               // waking, so cancel the 1 loaded above.  Also cancel
+               // it if the lock was already free.
+               if ((int)tmp >= 0) sub = 0;     // count >= 0 ?
+               ret &= ((long)tmp < 0);         // waking < 0 ?
+               sub += 1;
+               if (ret) break; 
+               tmp -= sub;
+               tmp = stq_c = tmp;
+          } while (tmp == 0);
+       */
+
+       __asm__ __volatile__(
+               "1:     ldq_l   %1,%4\n"
+               "       lda     %3,1\n"
+               "       addl    %1,0,%2\n"
+               "       sll     %3,32,%3\n"
+               "       cmple   %2,0,%0\n"
+               "       cmovge  %2,0,%3\n"
+               "       cmplt   %1,0,%2\n"
+               "       addq    %3,1,%3\n"
+               "       and     %0,%2,%0\n"
+               "       bne     %0,2f\n"
+               "       subq    %1,%3,%1\n"
+               "       stq_c   %1,%4\n"
+               "       beq     %1,3f\n"
+               "2:     mb\n"
+               ".subsection 2\n"
+               "3:     br      1b\n"
+               ".previous"
+               : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(sub)
+               : "m"(*sem)
+               : "memory");
+
+       return ret;
+}
+
+static inline void __up(struct semaphore *sem)
+{
+       long ret, tmp, tmp2, tmp3;
+
+       /* We must manipulate count and waking simultaneously and atomically.
+          Otherwise we have races between up and __down_failed_interruptible
+          waking up on a signal.
+
+          "Equivalent" C.  Note that we have to do this all without
+          (taken) branches in order to be a valid ll/sc sequence.
+
+          do {
+               tmp = ldq_l;
+               ret = (int)tmp + 1;                     // count += 1;
+               tmp2 = tmp & 0xffffffff00000000;        // extract waking
+               if (ret <= 0)                           // still sleepers?
+                       tmp2 += 0x0000000100000000;     // waking += 1;
+               tmp = ret & 0x00000000ffffffff;         // insert count
+               tmp |= tmp2;                            // insert waking;
+              tmp = stq_c = tmp;
+          } while (tmp == 0);
+       */
+
+       __asm__ __volatile__(
+               "       mb\n"
+               "1:     ldq_l   %1,%4\n"
+               "       addl    %1,1,%0\n"
+               "       zapnot  %1,0xf0,%2\n"
+               "       addq    %2,%5,%3\n"
+               "       cmovle  %0,%3,%2\n"
+               "       zapnot  %0,0x0f,%1\n"
+               "       bis     %1,%2,%1\n"
+               "       stq_c   %1,%4\n"
+               "       beq     %1,3f\n"
+               "2:\n"
+               ".subsection 2\n"
+               "3:     br      1b\n"
+               ".previous"
+               : "=&r"(ret), "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
+               : "m"(*sem), "r"(0x0000000100000000)
+               : "memory");
+
+       if (__builtin_expect(ret <= 0, 0))
+               __up_wakeup(sem);
+}
+
+#if !WAITQUEUE_DEBUG && !DEBUG_SEMAPHORE
+extern inline void down(struct semaphore *sem)
+{
+       __down(sem);
+}
+extern inline int down_interruptible(struct semaphore *sem)
+{
+       return __down_interruptible(sem);
+}
+extern inline int down_trylock(struct semaphore *sem)
+{
+       return __down_trylock(sem);
+}
+extern inline void up(struct semaphore *sem)
+{
+       __up(sem);
+}
+#endif
+
+/* rw mutexes (should that be mutices? =) -- throw rw
+ * spinlocks and semaphores together, and this is what we
+ * end up with...
+ *
+ * The lock is initialized to BIAS.  This way, a writer
+ * subtracts BIAS and gets 0 for the case of an uncontended
+ * lock.  Readers decrement by 1 and see a positive value
+ * when uncontended, negative if there are writers waiting
+ * (in which case it goes to sleep).
+ *
+ * The value 0x01000000 supports up to 128 processors and
+ * lots of processes.  BIAS must be chosen such that subtracting
+ * BIAS once per CPU will result in the int remaining
+ * negative.
+ * In terms of fairness, this should result in the lock
+ * flopping back and forth between readers and writers
+ * under heavy use.
+ *
+ *           -ben
+ *
+ * Once we start supporting machines with more than 128 CPUs,
+ * we should switch to a 64-bit atomic type instead of a 32-bit
+ * counter.  We shall probably go for a bias of 0x80000000 then,
+ * so that a single sethi instruction can set it.
+ *
+ *           -jj
+ */
+
+#define RW_LOCK_BIAS           0x01000000
+
+struct rw_semaphore {
+       atomic_t                count;
+       /* bit 0 means read bias granted;
+          bit 1 means write bias granted.  */
+       unsigned                granted;
+       wait_queue_head_t       wait;
+       wait_queue_head_t       write_bias_wait;
+#if WAITQUEUE_DEBUG
+       long                    __magic;
+       atomic_t                readers;
+       atomic_t                writers;
+#endif
+};
+
+#if WAITQUEUE_DEBUG
+#define __RWSEM_DEBUG_INIT     , ATOMIC_INIT(0), ATOMIC_INIT(0)
+#else
+#define __RWSEM_DEBUG_INIT     /* */
+#endif
+
+#define __RWSEM_INITIALIZER(name,count)                                        \
+       { ATOMIC_INIT(count), 0, __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \
+         __WAIT_QUEUE_HEAD_INITIALIZER((name).write_bias_wait)         \
+         __SEM_DEBUG_INIT(name) __RWSEM_DEBUG_INIT }
+
+#define __DECLARE_RWSEM_GENERIC(name,count) \
+       struct rw_semaphore name = __RWSEM_INITIALIZER(name,count)
+
+#define DECLARE_RWSEM(name) \
+       __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS)
+#define DECLARE_RWSEM_READ_LOCKED(name) \
+       __DECLARE_RWSEM_GENERIC(name, RW_LOCK_BIAS-1)
+#define DECLARE_RWSEM_WRITE_LOCKED(name) \
+       __DECLARE_RWSEM_GENERIC(name, 0)
+
+static inline void init_rwsem(struct rw_semaphore *sem)
+{
+       atomic_set (&sem->count, RW_LOCK_BIAS);
+       sem->granted = 0;
+       init_waitqueue_head(&sem->wait);
+       init_waitqueue_head(&sem->write_bias_wait);
+#if WAITQUEUE_DEBUG
+       sem->__magic = (long)&sem->__magic;
+       atomic_set(&sem->readers, 0);
+       atomic_set(&sem->writers, 0);
+#endif
+}
+
+extern void down_read(struct rw_semaphore *);
+extern void down_write(struct rw_semaphore *);
+extern void up_read(struct rw_semaphore *);
+extern void up_write(struct rw_semaphore *);
+extern void __down_read_failed(struct rw_semaphore *, int);
+extern void __down_write_failed(struct rw_semaphore *, int);
+extern void __rwsem_wake(struct rw_semaphore *, int);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+       long count = atomic_dec_return(&sem->count);
+       if (__builtin_expect(count < 0, 0))
+               __down_read_failed(sem, count);
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+       long count = atomic_sub_return(RW_LOCK_BIAS, &sem->count);
+       if (__builtin_expect(count != 0, 0))
+               __down_write_failed(sem, count);
+}
+
+/* When a reader does a release, the only significant case is when a
+   writer was waiting and we have bumped the count back up to 0; then
+   we must wake the writer up.  */
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+       long count;
+       mb();
+       count = atomic_inc_return(&sem->count);
+       if (__builtin_expect(count == 0, 0))
+               __rwsem_wake(sem, 0);
+}
+
+/* Releasing the writer is easy -- just release it and wake up
+   any sleepers.  */
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+       long count, wake;
+       mb();
+       count = atomic_add_return(RW_LOCK_BIAS, &sem->count);
+
+       /* Only do the wake if we were, but are no longer, negative.  */
+       wake = ((int)(count - RW_LOCK_BIAS) < 0) && count >= 0;
+       if (__builtin_expect(wake, 0))
+               __rwsem_wake(sem, count);
+}
+
+#if !WAITQUEUE_DEBUG && !DEBUG_RW_SEMAPHORE
+extern inline void down_read(struct rw_semaphore *sem)
+{
+       __down_read(sem);
+}
+extern inline void down_write(struct rw_semaphore *sem)
+{
+       __down_write(sem);
+}
+extern inline void up_read(struct rw_semaphore *sem)
+{
+       __up_read(sem);
+}
+extern inline void up_write(struct rw_semaphore *sem)
+{
+       __up_write(sem);
+}
+#endif
+
+#endif
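
The header above keeps down()/up() on a branch-free fast path and calls
out of line (__down_failed, __up_wakeup) only when the count says there
is contention.  As a minimal sketch of that split -- a userspace model
using GCC atomic builtins instead of Alpha ll/sc, with hypothetical
toy_* names that are not kernel API -- the shape is:

        /* Sketch only: models the fast/slow path split, not kernel code. */
        #include <stdio.h>

        struct toy_sem { int count; };          /* count < 0 => sleepers */

        static void toy_down_slow(struct toy_sem *sem)
        {
                /* The kernel would sleep on sem->wait here. */
                printf("contended: would sleep\n");
        }

        static void toy_down(struct toy_sem *sem)
        {
                /* Like atomic_dec_return(&sem->count) on the fast path. */
                if (__atomic_sub_fetch(&sem->count, 1, __ATOMIC_ACQUIRE) < 0)
                        toy_down_slow(sem);
        }

        static void toy_up(struct toy_sem *sem)
        {
                /* Like __up(): wake only if someone may be sleeping. */
                if (__atomic_add_fetch(&sem->count, 1, __ATOMIC_RELEASE) <= 0)
                        printf("would wake a sleeper\n");
        }

        int main(void)
        {
                struct toy_sem s = { 1 };
                toy_down(&s);           /* uncontended: fast path only */
                toy_up(&s);
                return 0;
        }

What the model leaves out is exactly what the real header works hardest
at: packing count and waking into a single quadword so that ll/sc can
update both atomically.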
index 64e05d1..e1d8098 100644 (file)
@@ -80,7+80,7 @@ static inline void spin_lock(spinlock_t * lock)
        "       blbs    %0,2b\n"
        "       br      1b\n"
        ".previous"
-       : "=r" (tmp), "=m" (lock->lock)
+       : "=&r" (tmp), "=m" (lock->lock)
        : "m"(lock->lock) : "memory");
 }
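
The spinlock fix above is a one-character constraint change: "=&r"
marks tmp as earlyclobber, so gcc may not assign it the same register
as an input that is still needed after tmp is first written.  A hedged
illustration of the same idea, using x86 rather than Alpha assembly
(copy_twice is a made-up name, not part of this patch):

        /* Sketch only: why "=&r" matters for multi-instruction asm. */
        static inline int copy_twice(int a, int b)
        {
                int tmp;
                __asm__ ("mov %1, %0\n\t"       /* tmp written early...      */
                         "add %2, %0"           /* ...while %2 is still live */
                         : "=&r" (tmp)          /* earlyclobber keeps tmp    */
                         : "r" (a), "r" (b));   /* distinct from a and b     */
                return tmp;
        }

With a plain "=r", gcc would be free to put tmp and b in one register,
and the first mov would silently destroy b before the add reads it.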
 
diff --git a/include/asm-alpha/xor.h b/include/asm-alpha/xor.h
new file mode 100644 (file)
index 0000000..e11477f
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * include/asm-alpha/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for alpha EV5 and EV6
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
+                       unsigned long *);
+extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
+                       unsigned long *, unsigned long *);
+extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
+                       unsigned long *, unsigned long *, unsigned long *);
+
+extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
+                                unsigned long *);
+extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
+                                unsigned long *, unsigned long *);
+extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
+                                unsigned long *, unsigned long *,
+                                unsigned long *);
+extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
+                                unsigned long *, unsigned long *,
+                                unsigned long *, unsigned long *);
+
+asm("
+       .text
+       .align 3
+       .ent xor_alpha_2
+xor_alpha_2:
+       .prologue 0
+       srl $16, 6, $16
+       .align 4
+2:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,8($17)
+       ldq $3,8($18)
+
+       ldq $4,16($17)
+       ldq $5,16($18)
+       ldq $6,24($17)
+       ldq $7,24($18)
+
+       ldq $19,32($17)
+       ldq $20,32($18)
+       ldq $21,40($17)
+       ldq $22,40($18)
+
+       ldq $23,48($17)
+       ldq $24,48($18)
+       ldq $25,56($17)
+       xor $0,$1,$0            # 7 cycles from $1 load
+
+       ldq $27,56($18)
+       xor $2,$3,$2
+       stq $0,0($17)
+       xor $4,$5,$4
+
+       stq $2,8($17)
+       xor $6,$7,$6
+       stq $4,16($17)
+       xor $19,$20,$19
+
+       stq $6,24($17)
+       xor $21,$22,$21
+       stq $19,32($17)
+       xor $23,$24,$23
+
+       stq $21,40($17)
+       xor $25,$27,$25
+       stq $23,48($17)
+       subq $16,1,$16
+
+       stq $25,56($17)
+       addq $17,64,$17
+       addq $18,64,$18
+       bgt $16,2b
+
+       ret
+       .end xor_alpha_2
+
+       .align 3
+       .ent xor_alpha_3
+xor_alpha_3:
+       .prologue 0
+       srl $16, 6, $16
+       .align 4
+3:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,8($17)
+
+       ldq $4,8($18)
+       ldq $6,16($17)
+       ldq $7,16($18)
+       ldq $21,24($17)
+
+       ldq $22,24($18)
+       ldq $24,32($17)
+       ldq $25,32($18)
+       ldq $5,8($19)
+
+       ldq $20,16($19)
+       ldq $23,24($19)
+       ldq $27,32($19)
+       nop
+
+       xor $0,$1,$1            # 8 cycles from $0 load
+       xor $3,$4,$4            # 6 cycles from $4 load
+       xor $6,$7,$7            # 6 cycles from $7 load
+       xor $21,$22,$22         # 5 cycles from $22 load
+
+       xor $1,$2,$2            # 9 cycles from $2 load
+       xor $24,$25,$25         # 5 cycles from $25 load
+       stq $2,0($17)
+       xor $4,$5,$5            # 6 cycles from $5 load
+
+       stq $5,8($17)
+       xor $7,$20,$20          # 7 cycles from $20 load
+       stq $20,16($17)
+       xor $22,$23,$23         # 7 cycles from $23 load
+
+       stq $23,24($17)
+       xor $25,$27,$27         # 7 cycles from $27 load
+       stq $27,32($17)
+       nop
+
+       ldq $0,40($17)
+       ldq $1,40($18)
+       ldq $3,48($17)
+       ldq $4,48($18)
+
+       ldq $6,56($17)
+       ldq $7,56($18)
+       ldq $2,40($19)
+       ldq $5,48($19)
+
+       ldq $20,56($19)
+       xor $0,$1,$1            # 4 cycles from $1 load
+       xor $3,$4,$4            # 5 cycles from $4 load
+       xor $6,$7,$7            # 5 cycles from $7 load
+
+       xor $1,$2,$2            # 4 cycles from $2 load
+       xor $4,$5,$5            # 5 cycles from $5 load
+       stq $2,40($17)
+       xor $7,$20,$20          # 4 cycles from $20 load
+
+       stq $5,48($17)
+       subq $16,1,$16
+       stq $20,56($17)
+       addq $19,64,$19
+
+       addq $18,64,$18
+       addq $17,64,$17
+       bgt $16,3b
+       ret
+       .end xor_alpha_3
+
+       .align 3
+       .ent xor_alpha_4
+xor_alpha_4:
+       .prologue 0
+       srl $16, 6, $16
+       .align 4
+4:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,0($20)
+
+       ldq $4,8($17)
+       ldq $5,8($18)
+       ldq $6,8($19)
+       ldq $7,8($20)
+
+       ldq $21,16($17)
+       ldq $22,16($18)
+       ldq $23,16($19)
+       ldq $24,16($20)
+
+       ldq $25,24($17)
+       xor $0,$1,$1            # 6 cycles from $1 load
+       ldq $27,24($18)
+       xor $2,$3,$3            # 6 cycles from $3 load
+
+       ldq $0,24($19)
+       xor $1,$3,$3
+       ldq $1,24($20)
+       xor $4,$5,$5            # 7 cycles from $5 load
+
+       stq $3,0($17)
+       xor $6,$7,$7
+       xor $21,$22,$22         # 7 cycles from $22 load
+       xor $5,$7,$7
+
+       stq $7,8($17)
+       xor $23,$24,$24         # 7 cycles from $24 load
+       ldq $2,32($17)
+       xor $22,$24,$24
+
+       ldq $3,32($18)
+       ldq $4,32($19)
+       ldq $5,32($20)
+       xor $25,$27,$27         # 8 cycles from $27 load
+
+       ldq $6,40($17)
+       ldq $7,40($18)
+       ldq $21,40($19)
+       ldq $22,40($20)
+
+       stq $24,16($17)
+       xor $0,$1,$1            # 9 cycles from $1 load
+       xor $2,$3,$3            # 5 cycles from $3 load
+       xor $27,$1,$1
+
+       stq $1,24($17)
+       xor $4,$5,$5            # 5 cycles from $5 load
+       ldq $23,48($17)
+       ldq $24,48($18)
+
+       ldq $25,48($19)
+       xor $3,$5,$5
+       ldq $27,48($20)
+       ldq $0,56($17)
+
+       ldq $1,56($18)
+       ldq $2,56($19)
+       xor $6,$7,$7            # 8 cycles from $6 load
+       ldq $3,56($20)
+
+       stq $5,32($17)
+       xor $21,$22,$22         # 8 cycles from $22 load
+       xor $7,$22,$22
+       xor $23,$24,$24         # 5 cycles from $24 load
+
+       stq $22,40($17)
+       xor $25,$27,$27         # 5 cycles from $27 load
+       xor $24,$27,$27
+       xor $0,$1,$1            # 5 cycles from $1 load
+
+       stq $27,48($17)
+       xor $2,$3,$3            # 4 cycles from $3 load
+       xor $1,$3,$3
+       subq $16,1,$16
+
+       stq $3,56($17)
+       addq $20,64,$20
+       addq $19,64,$19
+       addq $18,64,$18
+
+       addq $17,64,$17
+       bgt $16,4b
+       ret
+       .end xor_alpha_4
+
+       .align 3
+       .ent xor_alpha_5
+xor_alpha_5:
+       .prologue 0
+       srl $16, 6, $16
+       .align 4
+5:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,0($20)
+
+       ldq $4,0($21)
+       ldq $5,8($17)
+       ldq $6,8($18)
+       ldq $7,8($19)
+
+       ldq $22,8($20)
+       ldq $23,8($21)
+       ldq $24,16($17)
+       ldq $25,16($18)
+
+       ldq $27,16($19)
+       xor $0,$1,$1            # 6 cycles from $1 load
+       ldq $28,16($20)
+       xor $2,$3,$3            # 6 cycles from $3 load
+
+       ldq $0,16($21)
+       xor $1,$3,$3
+       ldq $1,24($17)
+       xor $3,$4,$4            # 7 cycles from $4 load
+
+       stq $4,0($17)
+       xor $5,$6,$6            # 7 cycles from $6 load
+       xor $7,$22,$22          # 7 cycles from $22 load
+       xor $6,$23,$23          # 7 cycles from $23 load
+
+       ldq $2,24($18)
+       xor $22,$23,$23
+       ldq $3,24($19)
+       xor $24,$25,$25         # 8 cycles from $25 load
+
+       stq $23,8($17)
+       xor $25,$27,$27         # 8 cycles from $27 load
+       ldq $4,24($20)
+       xor $28,$0,$0           # 7 cycles from $0 load
+
+       ldq $5,24($21)
+       xor $27,$0,$0
+       ldq $6,32($17)
+       ldq $7,32($18)
+
+       stq $0,16($17)
+       xor $1,$2,$2            # 6 cycles from $2 load
+       ldq $22,32($19)
+       xor $3,$4,$4            # 4 cycles from $4 load
+       
+       ldq $23,32($20)
+       xor $2,$4,$4
+       ldq $24,32($21)
+       ldq $25,40($17)
+
+       ldq $27,40($18)
+       ldq $28,40($19)
+       ldq $0,40($20)
+       xor $4,$5,$5            # 7 cycles from $5 load
+
+       stq $5,24($17)
+       xor $6,$7,$7            # 7 cycles from $7 load
+       ldq $1,40($21)
+       ldq $2,48($17)
+
+       ldq $3,48($18)
+       xor $7,$22,$22          # 7 cycles from $22 load
+       ldq $4,48($19)
+       xor $23,$24,$24         # 6 cycles from $24 load
+
+       ldq $5,48($20)
+       xor $22,$24,$24
+       ldq $6,48($21)
+       xor $25,$27,$27         # 7 cycles from $27 load
+
+       stq $24,32($17)
+       xor $27,$28,$28         # 8 cycles from $28 load
+       ldq $7,56($17)
+       xor $0,$1,$1            # 6 cycles from $1 load
+
+       ldq $22,56($18)
+       ldq $23,56($19)
+       ldq $24,56($20)
+       ldq $25,56($21)
+
+       xor $28,$1,$1
+       xor $2,$3,$3            # 9 cycles from $3 load
+       xor $3,$4,$4            # 9 cycles from $4 load
+       xor $5,$6,$6            # 8 cycles from $6 load
+
+       stq $1,40($17)
+       xor $4,$6,$6
+       xor $7,$22,$22          # 7 cycles from $22 load
+       xor $23,$24,$24         # 6 cycles from $24 load
+
+       stq $6,48($17)
+       xor $22,$24,$24
+       subq $16,1,$16
+       xor $24,$25,$25         # 8 cycles from $25 load
+
+       stq $25,56($17)
+       addq $21,64,$21
+       addq $20,64,$20
+       addq $19,64,$19
+
+       addq $18,64,$18
+       addq $17,64,$17
+       bgt $16,5b
+       ret
+       .end xor_alpha_5
+
+       .align 3
+       .ent xor_alpha_prefetch_2
+xor_alpha_prefetch_2:
+       .prologue 0
+       srl $16, 6, $16
+
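+       # Loads whose destination is $31 (the always-zero register)
+       # discard their data: the ldq's below only prime the cache,
+       # pulling the first four lines of each operand in before the
+       # main loop begins.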
+       ldq $31, 0($17)
+       ldq $31, 0($18)
+
+       ldq $31, 64($17)
+       ldq $31, 64($18)
+
+       ldq $31, 128($17)
+       ldq $31, 128($18)
+
+       ldq $31, 192($17)
+       ldq $31, 192($18)
+       .align 4
+2:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,8($17)
+       ldq $3,8($18)
+
+       ldq $4,16($17)
+       ldq $5,16($18)
+       ldq $6,24($17)
+       ldq $7,24($18)
+
+       ldq $19,32($17)
+       ldq $20,32($18)
+       ldq $21,40($17)
+       ldq $22,40($18)
+
+       ldq $23,48($17)
+       ldq $24,48($18)
+       ldq $25,56($17)
+       ldq $27,56($18)
+
+       ldq $31,256($17)
+       xor $0,$1,$0            # 8 cycles from $1 load
+       ldq $31,256($18)
+       xor $2,$3,$2
+
+       stq $0,0($17)
+       xor $4,$5,$4
+       stq $2,8($17)
+       xor $6,$7,$6
+
+       stq $4,16($17)
+       xor $19,$20,$19
+       stq $6,24($17)
+       xor $21,$22,$21
+
+       stq $19,32($17)
+       xor $23,$24,$23
+       stq $21,40($17)
+       xor $25,$27,$25
+
+       stq $23,48($17)
+       subq $16,1,$16
+       stq $25,56($17)
+       addq $17,64,$17
+
+       addq $18,64,$18
+       bgt $16,2b
+       ret
+       .end xor_alpha_prefetch_2
+
+       .align 3
+       .ent xor_alpha_prefetch_3
+xor_alpha_prefetch_3:
+       .prologue 0
+       srl $16, 6, $16
+
+       ldq $31, 0($17)
+       ldq $31, 0($18)
+       ldq $31, 0($19)
+
+       ldq $31, 64($17)
+       ldq $31, 64($18)
+       ldq $31, 64($19)
+
+       ldq $31, 128($17)
+       ldq $31, 128($18)
+       ldq $31, 128($19)
+
+       ldq $31, 192($17)
+       ldq $31, 192($18)
+       ldq $31, 192($19)
+       .align 4
+3:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,8($17)
+
+       ldq $4,8($18)
+       ldq $6,16($17)
+       ldq $7,16($18)
+       ldq $21,24($17)
+
+       ldq $22,24($18)
+       ldq $24,32($17)
+       ldq $25,32($18)
+       ldq $5,8($19)
+
+       ldq $20,16($19)
+       ldq $23,24($19)
+       ldq $27,32($19)
+       nop
+
+       xor $0,$1,$1            # 8 cycles from $0 load
+       xor $3,$4,$4            # 7 cycles from $4 load
+       xor $6,$7,$7            # 6 cycles from $7 load
+       xor $21,$22,$22         # 5 cycles from $22 load
+
+       xor $1,$2,$2            # 9 cycles from $2 load
+       xor $24,$25,$25         # 5 cycles from $25 load
+       stq $2,0($17)
+       xor $4,$5,$5            # 6 cycles from $5 load
+
+       stq $5,8($17)
+       xor $7,$20,$20          # 7 cycles from $20 load
+       stq $20,16($17)
+       xor $22,$23,$23         # 7 cycles from $23 load
+
+       stq $23,24($17)
+       xor $25,$27,$27         # 7 cycles from $27 load
+       stq $27,32($17)
+       nop
+
+       ldq $0,40($17)
+       ldq $1,40($18)
+       ldq $3,48($17)
+       ldq $4,48($18)
+
+       ldq $6,56($17)
+       ldq $7,56($18)
+       ldq $2,40($19)
+       ldq $5,48($19)
+
+       ldq $20,56($19)
+       ldq $31,256($17)
+       ldq $31,256($18)
+       ldq $31,256($19)
+
+       xor $0,$1,$1            # 6 cycles from $1 load
+       xor $3,$4,$4            # 5 cycles from $4 load
+       xor $6,$7,$7            # 5 cycles from $7 load
+       xor $1,$2,$2            # 4 cycles from $2 load
+       
+       xor $4,$5,$5            # 5 cycles from $5 load
+       xor $7,$20,$20          # 4 cycles from $20 load
+       stq $2,40($17)
+       subq $16,1,$16
+
+       stq $5,48($17)
+       addq $19,64,$19
+       stq $20,56($17)
+       addq $18,64,$18
+
+       addq $17,64,$17
+       bgt $16,3b
+       ret
+       .end xor_alpha_prefetch_3
+
+       .align 3
+       .ent xor_alpha_prefetch_4
+xor_alpha_prefetch_4:
+       .prologue 0
+       srl $16, 6, $16
+
+       ldq $31, 0($17)
+       ldq $31, 0($18)
+       ldq $31, 0($19)
+       ldq $31, 0($20)
+
+       ldq $31, 64($17)
+       ldq $31, 64($18)
+       ldq $31, 64($19)
+       ldq $31, 64($20)
+
+       ldq $31, 128($17)
+       ldq $31, 128($18)
+       ldq $31, 128($19)
+       ldq $31, 128($20)
+
+       ldq $31, 192($17)
+       ldq $31, 192($18)
+       ldq $31, 192($19)
+       ldq $31, 192($20)
+       .align 4
+4:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,0($20)
+
+       ldq $4,8($17)
+       ldq $5,8($18)
+       ldq $6,8($19)
+       ldq $7,8($20)
+
+       ldq $21,16($17)
+       ldq $22,16($18)
+       ldq $23,16($19)
+       ldq $24,16($20)
+
+       ldq $25,24($17)
+       xor $0,$1,$1            # 6 cycles from $1 load
+       ldq $27,24($18)
+       xor $2,$3,$3            # 6 cycles from $3 load
+
+       ldq $0,24($19)
+       xor $1,$3,$3
+       ldq $1,24($20)
+       xor $4,$5,$5            # 7 cycles from $5 load
+
+       stq $3,0($17)
+       xor $6,$7,$7
+       xor $21,$22,$22         # 7 cycles from $22 load
+       xor $5,$7,$7
+
+       stq $7,8($17)
+       xor $23,$24,$24         # 7 cycles from $24 load
+       ldq $2,32($17)
+       xor $22,$24,$24
+
+       ldq $3,32($18)
+       ldq $4,32($19)
+       ldq $5,32($20)
+       xor $25,$27,$27         # 8 cycles from $27 load
+
+       ldq $6,40($17)
+       ldq $7,40($18)
+       ldq $21,40($19)
+       ldq $22,40($20)
+
+       stq $24,16($17)
+       xor $0,$1,$1            # 9 cycles from $1 load
+       xor $2,$3,$3            # 5 cycles from $3 load
+       xor $27,$1,$1
+
+       stq $1,24($17)
+       xor $4,$5,$5            # 5 cycles from $5 load
+       ldq $23,48($17)
+       xor $3,$5,$5
+
+       ldq $24,48($18)
+       ldq $25,48($19)
+       ldq $27,48($20)
+       ldq $0,56($17)
+
+       ldq $1,56($18)
+       ldq $2,56($19)
+       ldq $3,56($20)
+       xor $6,$7,$7            # 8 cycles from $6 load
+
+       ldq $31,256($17)
+       xor $21,$22,$22         # 8 cycles from $22 load
+       ldq $31,256($18)
+       xor $7,$22,$22
+
+       ldq $31,256($19)
+       xor $23,$24,$24         # 6 cycles from $24 load
+       ldq $31,256($20)
+       xor $25,$27,$27         # 6 cycles from $27 load
+
+       stq $5,32($17)
+       xor $24,$27,$27
+       xor $0,$1,$1            # 7 cycles from $1 load
+       xor $2,$3,$3            # 6 cycles from $3 load
+
+       stq $22,40($17)
+       xor $1,$3,$3
+       stq $27,48($17)
+       subq $16,1,$16
+
+       stq $3,56($17)
+       addq $20,64,$20
+       addq $19,64,$19
+       addq $18,64,$18
+
+       addq $17,64,$17
+       bgt $16,4b
+       ret
+       .end xor_alpha_prefetch_4
+
+       .align 3
+       .ent xor_alpha_prefetch_5
+xor_alpha_prefetch_5:
+       .prologue 0
+       srl $16, 6, $16
+
+       ldq $31, 0($17)
+       ldq $31, 0($18)
+       ldq $31, 0($19)
+       ldq $31, 0($20)
+       ldq $31, 0($21)
+
+       ldq $31, 64($17)
+       ldq $31, 64($18)
+       ldq $31, 64($19)
+       ldq $31, 64($20)
+       ldq $31, 64($21)
+
+       ldq $31, 128($17)
+       ldq $31, 128($18)
+       ldq $31, 128($19)
+       ldq $31, 128($20)
+       ldq $31, 128($21)
+
+       ldq $31, 192($17)
+       ldq $31, 192($18)
+       ldq $31, 192($19)
+       ldq $31, 192($20)
+       ldq $31, 192($21)
+       .align 4
+5:
+       ldq $0,0($17)
+       ldq $1,0($18)
+       ldq $2,0($19)
+       ldq $3,0($20)
+
+       ldq $4,0($21)
+       ldq $5,8($17)
+       ldq $6,8($18)
+       ldq $7,8($19)
+
+       ldq $22,8($20)
+       ldq $23,8($21)
+       ldq $24,16($17)
+       ldq $25,16($18)
+
+       ldq $27,16($19)
+       xor $0,$1,$1            # 6 cycles from $1 load
+       ldq $28,16($20)
+       xor $2,$3,$3            # 6 cycles from $3 load
+
+       ldq $0,16($21)
+       xor $1,$3,$3
+       ldq $1,24($17)
+       xor $3,$4,$4            # 7 cycles from $4 load
+
+       stq $4,0($17)
+       xor $5,$6,$6            # 7 cycles from $6 load
+       xor $7,$22,$22          # 7 cycles from $22 load
+       xor $6,$23,$23          # 7 cycles from $23 load
+
+       ldq $2,24($18)
+       xor $22,$23,$23
+       ldq $3,24($19)
+       xor $24,$25,$25         # 8 cycles from $25 load
+
+       stq $23,8($17)
+       xor $25,$27,$27         # 8 cycles from $27 load
+       ldq $4,24($20)
+       xor $28,$0,$0           # 7 cycles from $0 load
+
+       ldq $5,24($21)
+       xor $27,$0,$0
+       ldq $6,32($17)
+       ldq $7,32($18)
+
+       stq $0,16($17)
+       xor $1,$2,$2            # 6 cycles from $2 load
+       ldq $22,32($19)
+       xor $3,$4,$4            # 4 cycles from $4 load
+       
+       ldq $23,32($20)
+       xor $2,$4,$4
+       ldq $24,32($21)
+       ldq $25,40($17)
+
+       ldq $27,40($18)
+       ldq $28,40($19)
+       ldq $0,40($20)
+       xor $4,$5,$5            # 7 cycles from $5 load
+
+       stq $5,24($17)
+       xor $6,$7,$7            # 7 cycles from $7 load
+       ldq $1,40($21)
+       ldq $2,48($17)
+
+       ldq $3,48($18)
+       xor $7,$22,$22          # 7 cycles from $22 load
+       ldq $4,48($19)
+       xor $23,$24,$24         # 6 cycles from $24 load
+
+       ldq $5,48($20)
+       xor $22,$24,$24
+       ldq $6,48($21)
+       xor $25,$27,$27         # 7 cycles from $27 load
+
+       stq $24,32($17)
+       xor $27,$28,$28         # 8 cycles from $28 load
+       ldq $7,56($17)
+       xor $0,$1,$1            # 6 cycles from $1 load
+
+       ldq $22,56($18)
+       ldq $23,56($19)
+       ldq $24,56($20)
+       ldq $25,56($21)
+
+       ldq $31,256($17)
+       xor $28,$1,$1
+       ldq $31,256($18)
+       xor $2,$3,$3            # 9 cycles from $3 load
+
+       ldq $31,256($19)
+       xor $3,$4,$4            # 9 cycles from $4 load
+       ldq $31,256($20)
+       xor $5,$6,$6            # 8 cycles from $6 load
+
+       stq $1,40($17)
+       xor $4,$6,$6
+       xor $7,$22,$22          # 7 cycles from $22 load
+       xor $23,$24,$24         # 6 cycles from $24 load
+
+       stq $6,48($17)
+       xor $22,$24,$24
+       ldq $31,256($21)
+       xor $24,$25,$25         # 8 cycles from $25 load
+
+       stq $25,56($17)
+       subq $16,1,$16
+       addq $21,64,$21
+       addq $20,64,$20
+
+       addq $19,64,$19
+       addq $18,64,$18
+       addq $17,64,$17
+       bgt $16,5b
+
+       ret
+       .end xor_alpha_prefetch_5
+");
+
+static struct xor_block_template xor_block_alpha = {
+       name: "alpha",
+       do_2: xor_alpha_2,
+       do_3: xor_alpha_3,
+       do_4: xor_alpha_4,
+       do_5: xor_alpha_5,
+};
+
+static struct xor_block_template xor_block_alpha_prefetch = {
+       name: "alpha prefetch",
+       do_2: xor_alpha_prefetch_2,
+       do_3: xor_alpha_prefetch_3,
+       do_4: xor_alpha_prefetch_4,
+       do_5: xor_alpha_prefetch_5,
+};
+
+/* For grins, also test the generic routines.  */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES                              \
+       do {                                            \
+               xor_speed(&xor_block_8regs);            \
+               xor_speed(&xor_block_32regs);           \
+               xor_speed(&xor_block_alpha);            \
+               xor_speed(&xor_block_alpha_prefetch);   \
+       } while (0)
+
+/* Force the use of alpha_prefetch if EV6, as it is significantly
+   faster in the cold cache case.  */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+       (implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
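
XOR_TRY_TEMPLATES hands the candidate implementations to xor_speed() in
drivers/md/xor.c, which times each one and records the fastest;
XOR_SELECT_TEMPLATE then gets the final word (here, forcing the
prefetch variant on EV6, where cold-cache behaviour dominates).  A
sketch of that calibrate-then-override shape, with hypothetical toy_*
names and clock() standing in for the kernel's jiffies-based timing:

        /* Sketch only: the calibration pattern, not the kernel code. */
        #include <stdio.h>
        #include <time.h>

        #define BUF_LONGS 4096

        static void toy_xor_2(unsigned long bytes,
                              unsigned long *p1, unsigned long *p2)
        {
                unsigned long i;
                for (i = 0; i < bytes / sizeof(long); i++)
                        p1[i] ^= p2[i];
        }

        struct toy_template {
                const char *name;
                void (*do_2)(unsigned long, unsigned long *, unsigned long *);
        };

        static unsigned long a[BUF_LONGS], b[BUF_LONGS];

        static clock_t toy_speed(const struct toy_template *t)
        {
                clock_t start = clock();
                int i;
                for (i = 0; i < 1000; i++)
                        t->do_2(sizeof(a), a, b);
                return clock() - start;
        }

        int main(void)
        {
                struct toy_template cand[] = { { "8regs-style", toy_xor_2 } };
                const struct toy_template *best = &cand[0];
                clock_t best_t = toy_speed(best);
                /* With more candidates: time each, keep the fastest, then
                   let an architecture override (XOR_SELECT_TEMPLATE) have
                   the final say. */
                printf("fastest: %s (%ld ticks)\n", best->name, (long)best_t);
                return 0;
        }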
diff --git a/include/asm-arm/xor.h b/include/asm-arm/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
new file mode 100644 (file)
index 0000000..ebda0f9
--- /dev/null
@@ -0,0 +1,322 @@
+/*
+ * include/asm-generic/xor.h
+ *
+ * Generic optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+static void
+xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               p1[0] ^= p2[0];
+               p1[1] ^= p2[1];
+               p1[2] ^= p2[2];
+               p1[3] ^= p2[3];
+               p1[4] ^= p2[4];
+               p1[5] ^= p2[5];
+               p1[6] ^= p2[6];
+               p1[7] ^= p2[7];
+               p1 += 8;
+               p2 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               p1[0] ^= p2[0] ^ p3[0];
+               p1[1] ^= p2[1] ^ p3[1];
+               p1[2] ^= p2[2] ^ p3[2];
+               p1[3] ^= p2[3] ^ p3[3];
+               p1[4] ^= p2[4] ^ p3[4];
+               p1[5] ^= p2[5] ^ p3[5];
+               p1[6] ^= p2[6] ^ p3[6];
+               p1[7] ^= p2[7] ^ p3[7];
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3, unsigned long *p4)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
+               p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
+               p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
+               p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
+               p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
+               p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
+               p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
+               p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
+               p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
+               p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
+               p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
+               p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
+               p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
+               p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
+               p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+               p5 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               register long d0, d1, d2, d3, d4, d5, d6, d7;
+               d0 = p1[0];     /* Pull the stuff into registers        */
+               d1 = p1[1];     /*  ... in bursts, if possible.         */
+               d2 = p1[2];
+               d3 = p1[3];
+               d4 = p1[4];
+               d5 = p1[5];
+               d6 = p1[6];
+               d7 = p1[7];
+               d0 ^= p2[0];
+               d1 ^= p2[1];
+               d2 ^= p2[2];
+               d3 ^= p2[3];
+               d4 ^= p2[4];
+               d5 ^= p2[5];
+               d6 ^= p2[6];
+               d7 ^= p2[7];
+               p1[0] = d0;     /* Store the result (in bursts)         */
+               p1[1] = d1;
+               p1[2] = d2;
+               p1[3] = d3;
+               p1[4] = d4;
+               p1[5] = d5;
+               p1[6] = d6;
+               p1[7] = d7;
+               p1 += 8;
+               p2 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               register long d0, d1, d2, d3, d4, d5, d6, d7;
+               d0 = p1[0];     /* Pull the stuff into registers        */
+               d1 = p1[1];     /*  ... in bursts, if possible.         */
+               d2 = p1[2];
+               d3 = p1[3];
+               d4 = p1[4];
+               d5 = p1[5];
+               d6 = p1[6];
+               d7 = p1[7];
+               d0 ^= p2[0];
+               d1 ^= p2[1];
+               d2 ^= p2[2];
+               d3 ^= p2[3];
+               d4 ^= p2[4];
+               d5 ^= p2[5];
+               d6 ^= p2[6];
+               d7 ^= p2[7];
+               d0 ^= p3[0];
+               d1 ^= p3[1];
+               d2 ^= p3[2];
+               d3 ^= p3[3];
+               d4 ^= p3[4];
+               d5 ^= p3[5];
+               d6 ^= p3[6];
+               d7 ^= p3[7];
+               p1[0] = d0;     /* Store the result (in bursts)         */
+               p1[1] = d1;
+               p1[2] = d2;
+               p1[3] = d3;
+               p1[4] = d4;
+               p1[5] = d5;
+               p1[6] = d6;
+               p1[7] = d7;
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3, unsigned long *p4)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               register long d0, d1, d2, d3, d4, d5, d6, d7;
+               d0 = p1[0];     /* Pull the stuff into registers        */
+               d1 = p1[1];     /*  ... in bursts, if possible.         */
+               d2 = p1[2];
+               d3 = p1[3];
+               d4 = p1[4];
+               d5 = p1[5];
+               d6 = p1[6];
+               d7 = p1[7];
+               d0 ^= p2[0];
+               d1 ^= p2[1];
+               d2 ^= p2[2];
+               d3 ^= p2[3];
+               d4 ^= p2[4];
+               d5 ^= p2[5];
+               d6 ^= p2[6];
+               d7 ^= p2[7];
+               d0 ^= p3[0];
+               d1 ^= p3[1];
+               d2 ^= p3[2];
+               d3 ^= p3[3];
+               d4 ^= p3[4];
+               d5 ^= p3[5];
+               d6 ^= p3[6];
+               d7 ^= p3[7];
+               d0 ^= p4[0];
+               d1 ^= p4[1];
+               d2 ^= p4[2];
+               d3 ^= p4[3];
+               d4 ^= p4[4];
+               d5 ^= p4[5];
+               d6 ^= p4[6];
+               d7 ^= p4[7];
+               p1[0] = d0;     /* Store the result (in bursts)         */
+               p1[1] = d1;
+               p1[2] = d2;
+               p1[3] = d3;
+               p1[4] = d4;
+               p1[5] = d5;
+               p1[6] = d6;
+               p1[7] = d7;
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+       } while (--lines > 0);
+}
+
+static void
+xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+           unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       long lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               register long d0, d1, d2, d3, d4, d5, d6, d7;
+               d0 = p1[0];     /* Pull the stuff into registers        */
+               d1 = p1[1];     /*  ... in bursts, if possible.         */
+               d2 = p1[2];
+               d3 = p1[3];
+               d4 = p1[4];
+               d5 = p1[5];
+               d6 = p1[6];
+               d7 = p1[7];
+               d0 ^= p2[0];
+               d1 ^= p2[1];
+               d2 ^= p2[2];
+               d3 ^= p2[3];
+               d4 ^= p2[4];
+               d5 ^= p2[5];
+               d6 ^= p2[6];
+               d7 ^= p2[7];
+               d0 ^= p3[0];
+               d1 ^= p3[1];
+               d2 ^= p3[2];
+               d3 ^= p3[3];
+               d4 ^= p3[4];
+               d5 ^= p3[5];
+               d6 ^= p3[6];
+               d7 ^= p3[7];
+               d0 ^= p4[0];
+               d1 ^= p4[1];
+               d2 ^= p4[2];
+               d3 ^= p4[3];
+               d4 ^= p4[4];
+               d5 ^= p4[5];
+               d6 ^= p4[6];
+               d7 ^= p4[7];
+               d0 ^= p5[0];
+               d1 ^= p5[1];
+               d2 ^= p5[2];
+               d3 ^= p5[3];
+               d4 ^= p5[4];
+               d5 ^= p5[5];
+               d6 ^= p5[6];
+               d7 ^= p5[7];
+               p1[0] = d0;     /* Store the result (in bursts)         */
+               p1[1] = d1;
+               p1[2] = d2;
+               p1[3] = d3;
+               p1[4] = d4;
+               p1[5] = d5;
+               p1[6] = d6;
+               p1[7] = d7;
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+               p5 += 8;
+       } while (--lines > 0);
+}
+
+static struct xor_block_template xor_block_8regs = {
+       name: "8regs",
+       do_2: xor_8regs_2,
+       do_3: xor_8regs_3,
+       do_4: xor_8regs_4,
+       do_5: xor_8regs_5,
+};
+
+static struct xor_block_template xor_block_32regs = {
+       name: "32regs",
+       do_2: xor_32regs_2,
+       do_3: xor_32regs_3,
+       do_4: xor_32regs_4,
+       do_5: xor_32regs_5,
+};
+
+#define XOR_TRY_TEMPLATES                      \
+       do {                                    \
+               xor_speed(&xor_block_8regs);    \
+               xor_speed(&xor_block_32regs);   \
+       } while (0)
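
These generic routines are what the RAID-5 code ultimately rests on:
xoring the data blocks yields the parity block, and xoring the parity
with the surviving blocks regenerates a lost one.  A small
self-contained check of that property, reusing the 8regs loop shape
above (toy_xor_2 is a local stand-in, not the kernel symbol):

        /* Sketch only: XOR parity generation and single-disk recovery. */
        #include <assert.h>
        #include <string.h>

        static void toy_xor_2(unsigned long bytes,
                              unsigned long *p1, unsigned long *p2)
        {
                unsigned long i;
                for (i = 0; i < bytes / sizeof(long); i++)
                        p1[i] ^= p2[i];
        }

        int main(void)
        {
                unsigned long d1[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
                unsigned long d2[8] = { 9, 9, 9, 9, 9, 9, 9, 9 };
                unsigned long parity[8], lost[8];

                memcpy(parity, d1, sizeof(parity));
                toy_xor_2(sizeof(parity), parity, d2);  /* parity = d1 ^ d2 */

                memcpy(lost, parity, sizeof(lost));
                toy_xor_2(sizeof(lost), lost, d2);      /* recover d1       */
                assert(memcmp(lost, d1, sizeof(d1)) == 0);
                return 0;
        }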
diff --git a/include/asm-i386/xor.h b/include/asm-i386/xor.h
new file mode 100644 (file)
index 0000000..6a2230b
--- /dev/null
@@ -0,0 +1,858 @@
+/*
+ * include/asm-i386/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
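+/* MMX shares the x87 register file, so whatever FPU state is live in
+   the hardware must be parked with fsave before %%mm0-7 are touched
+   and put back with frstor afterwards; the clts/stts pair keeps the
+   lazy FPU-switch logic (PF_USEDFPU, the TS bit) consistent.  */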
+#define FPU_SAVE                                                       \
+  do {                                                                 \
+       if (!(current->flags & PF_USEDFPU))                             \
+               __asm__ __volatile__ (" clts;\n");                      \
+       __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));    \
+  } while (0)
+
+#define FPU_RESTORE                                                    \
+  do {                                                                 \
+       __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));         \
+       if (!(current->flags & PF_USEDFPU))                             \
+               stts();                                                 \
+  } while (0)
+
+#define LD(x,y)                "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
+#define ST(x,y)                "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
+#define XO1(x,y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
+#define XO2(x,y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
+#define XO3(x,y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
+#define XO4(x,y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
+
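+/* Each BLOCK(i) defined inside the functions below expands to a fully
+   unrolled load/xor/store sequence over four quadwords; the staircase
+   indentation tracks which %%mm register (0-3) each line touches.  */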
+
+static void
+xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       unsigned long lines = bytes >> 7;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+       LD(i,0)                                 \
+               LD(i+1,1)                       \
+                       LD(i+2,2)               \
+                               LD(i+3,3)       \
+       XO1(i,0)                                \
+       ST(i,0)                                 \
+               XO1(i+1,1)                      \
+               ST(i+1,1)                       \
+                       XO1(i+2,2)              \
+                       ST(i+2,2)               \
+                               XO1(i+3,3)      \
+                               ST(i+3,3)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+       BLOCK(0)
+       BLOCK(4)
+       BLOCK(8)
+       BLOCK(12)
+
+       "       addl $128, %1         ;\n"
+       "       addl $128, %2         ;\n"
+       "       decl %0               ;\n"
+       "       jnz 1b                ;\n"
+               :
+       : "r" (lines),
+         "r" (p1), "r" (p2)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+             unsigned long *p3)
+{
+       unsigned long lines = bytes >> 7;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+       LD(i,0)                                 \
+               LD(i+1,1)                       \
+                       LD(i+2,2)               \
+                               LD(i+3,3)       \
+       XO1(i,0)                                \
+               XO1(i+1,1)                      \
+                       XO1(i+2,2)              \
+                               XO1(i+3,3)      \
+       XO2(i,0)                                \
+       ST(i,0)                                 \
+               XO2(i+1,1)                      \
+               ST(i+1,1)                       \
+                       XO2(i+2,2)              \
+                       ST(i+2,2)               \
+                               XO2(i+3,3)      \
+                               ST(i+3,3)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+       BLOCK(0)
+       BLOCK(4)
+       BLOCK(8)
+       BLOCK(12)
+
+       "       addl $128, %1         ;\n"
+       "       addl $128, %2         ;\n"
+       "       addl $128, %3         ;\n"
+       "       decl %0               ;\n"
+       "       jnz 1b                ;\n"
+               :
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+             unsigned long *p3, unsigned long *p4)
+{
+       unsigned long lines = bytes >> 7;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+       LD(i,0)                                 \
+               LD(i+1,1)                       \
+                       LD(i+2,2)               \
+                               LD(i+3,3)       \
+       XO1(i,0)                                \
+               XO1(i+1,1)                      \
+                       XO1(i+2,2)              \
+                               XO1(i+3,3)      \
+       XO2(i,0)                                \
+               XO2(i+1,1)                      \
+                       XO2(i+2,2)              \
+                               XO2(i+3,3)      \
+       XO3(i,0)                                \
+       ST(i,0)                                 \
+               XO3(i+1,1)                      \
+               ST(i+1,1)                       \
+                       XO3(i+2,2)              \
+                       ST(i+2,2)               \
+                               XO3(i+3,3)      \
+                               ST(i+3,3)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+       BLOCK(0)
+       BLOCK(4)
+       BLOCK(8)
+       BLOCK(12)
+
+       "       addl $128, %1         ;\n"
+       "       addl $128, %2         ;\n"
+       "       addl $128, %3         ;\n"
+       "       addl $128, %4         ;\n"
+       "       decl %0               ;\n"
+       "       jnz 1b                ;\n"
+               :
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static void
+xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+             unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       unsigned long lines = bytes >> 7;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+       LD(i,0)                                 \
+               LD(i+1,1)                       \
+                       LD(i+2,2)               \
+                               LD(i+3,3)       \
+       XO1(i,0)                                \
+               XO1(i+1,1)                      \
+                       XO1(i+2,2)              \
+                               XO1(i+3,3)      \
+       XO2(i,0)                                \
+               XO2(i+1,1)                      \
+                       XO2(i+2,2)              \
+                               XO2(i+3,3)      \
+       XO3(i,0)                                \
+               XO3(i+1,1)                      \
+                       XO3(i+2,2)              \
+                               XO3(i+3,3)      \
+       XO4(i,0)                                \
+       ST(i,0)                                 \
+               XO4(i+1,1)                      \
+               ST(i+1,1)                       \
+                       XO4(i+2,2)              \
+                       ST(i+2,2)               \
+                               XO4(i+3,3)      \
+                               ST(i+3,3)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+       BLOCK(0)
+       BLOCK(4)
+       BLOCK(8)
+       BLOCK(12)
+
+       "       addl $128, %1         ;\n"
+       "       addl $128, %2         ;\n"
+       "       addl $128, %3         ;\n"
+       "       addl $128, %4         ;\n"
+       "       addl $128, %5         ;\n"
+       "       decl %0               ;\n"
+       "       jnz 1b                ;\n"
+               :
+       : "g" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef BLOCK
+
+static void
+xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       unsigned long lines = bytes >> 6;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+       " .align 32                  ;\n"
+       " 1:                         ;\n"
+       "       movq   (%1), %%mm0   ;\n"
+       "       movq  8(%1), %%mm1   ;\n"
+       "       pxor   (%2), %%mm0   ;\n"
+       "       movq 16(%1), %%mm2   ;\n"
+       "       movq %%mm0,   (%1)   ;\n"
+       "       pxor  8(%2), %%mm1   ;\n"
+       "       movq 24(%1), %%mm3   ;\n"
+       "       movq %%mm1,  8(%1)   ;\n"
+       "       pxor 16(%2), %%mm2   ;\n"
+       "       movq 32(%1), %%mm4   ;\n"
+       "       movq %%mm2, 16(%1)   ;\n"
+       "       pxor 24(%2), %%mm3   ;\n"
+       "       movq 40(%1), %%mm5   ;\n"
+       "       movq %%mm3, 24(%1)   ;\n"
+       "       pxor 32(%2), %%mm4   ;\n"
+       "       movq 48(%1), %%mm6   ;\n"
+       "       movq %%mm4, 32(%1)   ;\n"
+       "       pxor 40(%2), %%mm5   ;\n"
+       "       movq 56(%1), %%mm7   ;\n"
+       "       movq %%mm5, 40(%1)   ;\n"
+       "       pxor 48(%2), %%mm6   ;\n"
+       "       pxor 56(%2), %%mm7   ;\n"
+       "       movq %%mm6, 48(%1)   ;\n"
+       "       movq %%mm7, 56(%1)   ;\n"
+       
+       "       addl $64, %1         ;\n"
+       "       addl $64, %2         ;\n"
+       "       decl %0              ;\n"
+       "       jnz 1b               ;\n"
+       : 
+       : "r" (lines),
+         "r" (p1), "r" (p2)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+            unsigned long *p3)
+{
+       unsigned long lines = bytes >> 6;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+       " .align 32,0x90             ;\n"
+       " 1:                         ;\n"
+       "       movq   (%1), %%mm0   ;\n"
+       "       movq  8(%1), %%mm1   ;\n"
+       "       pxor   (%2), %%mm0   ;\n"
+       "       movq 16(%1), %%mm2   ;\n"
+       "       pxor  8(%2), %%mm1   ;\n"
+       "       pxor   (%3), %%mm0   ;\n"
+       "       pxor 16(%2), %%mm2   ;\n"
+       "       movq %%mm0,   (%1)   ;\n"
+       "       pxor  8(%3), %%mm1   ;\n"
+       "       pxor 16(%3), %%mm2   ;\n"
+       "       movq 24(%1), %%mm3   ;\n"
+       "       movq %%mm1,  8(%1)   ;\n"
+       "       movq 32(%1), %%mm4   ;\n"
+       "       movq 40(%1), %%mm5   ;\n"
+       "       pxor 24(%2), %%mm3   ;\n"
+       "       movq %%mm2, 16(%1)   ;\n"
+       "       pxor 32(%2), %%mm4   ;\n"
+       "       pxor 24(%3), %%mm3   ;\n"
+       "       pxor 40(%2), %%mm5   ;\n"
+       "       movq %%mm3, 24(%1)   ;\n"
+       "       pxor 32(%3), %%mm4   ;\n"
+       "       pxor 40(%3), %%mm5   ;\n"
+       "       movq 48(%1), %%mm6   ;\n"
+       "       movq %%mm4, 32(%1)   ;\n"
+       "       movq 56(%1), %%mm7   ;\n"
+       "       pxor 48(%2), %%mm6   ;\n"
+       "       movq %%mm5, 40(%1)   ;\n"
+       "       pxor 56(%2), %%mm7   ;\n"
+       "       pxor 48(%3), %%mm6   ;\n"
+       "       pxor 56(%3), %%mm7   ;\n"
+       "       movq %%mm6, 48(%1)   ;\n"
+       "       movq %%mm7, 56(%1)   ;\n"
+      
+       "       addl $64, %1         ;\n"
+       "       addl $64, %2         ;\n"
+       "       addl $64, %3         ;\n"
+       "       decl %0              ;\n"
+       "       jnz 1b               ;\n"
+       : 
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3)
+       : "memory" );
+
+       FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+            unsigned long *p3, unsigned long *p4)
+{
+       unsigned long lines = bytes >> 6;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+       " .align 32,0x90             ;\n"
+       " 1:                         ;\n"
+       "       movq   (%1), %%mm0   ;\n"
+       "       movq  8(%1), %%mm1   ;\n"
+       "       pxor   (%2), %%mm0   ;\n"
+       "       movq 16(%1), %%mm2   ;\n"
+       "       pxor  8(%2), %%mm1   ;\n"
+       "       pxor   (%3), %%mm0   ;\n"
+       "       pxor 16(%2), %%mm2   ;\n"
+       "       pxor  8(%3), %%mm1   ;\n"
+       "       pxor   (%4), %%mm0   ;\n"
+       "       movq 24(%1), %%mm3   ;\n"
+       "       pxor 16(%3), %%mm2   ;\n"
+       "       pxor  8(%4), %%mm1   ;\n"
+       "       movq %%mm0,   (%1)   ;\n"
+       "       movq 32(%1), %%mm4   ;\n"
+       "       pxor 24(%2), %%mm3   ;\n"
+       "       pxor 16(%4), %%mm2   ;\n"
+       "       movq %%mm1,  8(%1)   ;\n"
+       "       movq 40(%1), %%mm5   ;\n"
+       "       pxor 32(%2), %%mm4   ;\n"
+       "       pxor 24(%3), %%mm3   ;\n"
+       "       movq %%mm2, 16(%1)   ;\n"
+       "       pxor 40(%2), %%mm5   ;\n"
+       "       pxor 32(%3), %%mm4   ;\n"
+       "       pxor 24(%4), %%mm3   ;\n"
+       "       movq %%mm3, 24(%1)   ;\n"
+       "       movq 56(%1), %%mm7   ;\n"
+       "       movq 48(%1), %%mm6   ;\n"
+       "       pxor 40(%3), %%mm5   ;\n"
+       "       pxor 32(%4), %%mm4   ;\n"
+       "       pxor 48(%2), %%mm6   ;\n"
+       "       movq %%mm4, 32(%1)   ;\n"
+       "       pxor 56(%2), %%mm7   ;\n"
+       "       pxor 40(%4), %%mm5   ;\n"
+       "       pxor 48(%3), %%mm6   ;\n"
+       "       pxor 56(%3), %%mm7   ;\n"
+       "       movq %%mm5, 40(%1)   ;\n"
+       "       pxor 48(%4), %%mm6   ;\n"
+       "       pxor 56(%4), %%mm7   ;\n"
+       "       movq %%mm6, 48(%1)   ;\n"
+       "       movq %%mm7, 56(%1)   ;\n"
+      
+       "       addl $64, %1         ;\n"
+       "       addl $64, %2         ;\n"
+       "       addl $64, %3         ;\n"
+       "       addl $64, %4         ;\n"
+       "       decl %0              ;\n"
+       "       jnz 1b               ;\n"
+       : 
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static void
+xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+            unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       unsigned long lines = bytes >> 6;
+       char fpu_save[108];
+
+       FPU_SAVE;
+
+       __asm__ __volatile__ (
+       " .align 32,0x90             ;\n"
+       " 1:                         ;\n"
+       "       movq   (%1), %%mm0   ;\n"
+       "       movq  8(%1), %%mm1   ;\n"
+       "       pxor   (%2), %%mm0   ;\n"
+       "       pxor  8(%2), %%mm1   ;\n"
+       "       movq 16(%1), %%mm2   ;\n"
+       "       pxor   (%3), %%mm0   ;\n"
+       "       pxor  8(%3), %%mm1   ;\n"
+       "       pxor 16(%2), %%mm2   ;\n"
+       "       pxor   (%4), %%mm0   ;\n"
+       "       pxor  8(%4), %%mm1   ;\n"
+       "       pxor 16(%3), %%mm2   ;\n"
+       "       movq 24(%1), %%mm3   ;\n"
+       "       pxor   (%5), %%mm0   ;\n"
+       "       pxor  8(%5), %%mm1   ;\n"
+       "       movq %%mm0,   (%1)   ;\n"
+       "       pxor 16(%4), %%mm2   ;\n"
+       "       pxor 24(%2), %%mm3   ;\n"
+       "       movq %%mm1,  8(%1)   ;\n"
+       "       pxor 16(%5), %%mm2   ;\n"
+       "       pxor 24(%3), %%mm3   ;\n"
+       "       movq 32(%1), %%mm4   ;\n"
+       "       movq %%mm2, 16(%1)   ;\n"
+       "       pxor 24(%4), %%mm3   ;\n"
+       "       pxor 32(%2), %%mm4   ;\n"
+       "       movq 40(%1), %%mm5   ;\n"
+       "       pxor 24(%5), %%mm3   ;\n"
+       "       pxor 32(%3), %%mm4   ;\n"
+       "       pxor 40(%2), %%mm5   ;\n"
+       "       movq %%mm3, 24(%1)   ;\n"
+       "       pxor 32(%4), %%mm4   ;\n"
+       "       pxor 40(%3), %%mm5   ;\n"
+       "       movq 48(%1), %%mm6   ;\n"
+       "       movq 56(%1), %%mm7   ;\n"
+       "       pxor 32(%5), %%mm4   ;\n"
+       "       pxor 40(%4), %%mm5   ;\n"
+       "       pxor 48(%2), %%mm6   ;\n"
+       "       pxor 56(%2), %%mm7   ;\n"
+       "       movq %%mm4, 32(%1)   ;\n"
+       "       pxor 48(%3), %%mm6   ;\n"
+       "       pxor 56(%3), %%mm7   ;\n"
+       "       pxor 40(%5), %%mm5   ;\n"
+       "       pxor 48(%4), %%mm6   ;\n"
+       "       pxor 56(%4), %%mm7   ;\n"
+       "       movq %%mm5, 40(%1)   ;\n"
+       "       pxor 48(%5), %%mm6   ;\n"
+       "       pxor 56(%5), %%mm7   ;\n"
+       "       movq %%mm6, 48(%1)   ;\n"
+       "       movq %%mm7, 56(%1)   ;\n"
+      
+       "       addl $64, %1         ;\n"
+       "       addl $64, %2         ;\n"
+       "       addl $64, %3         ;\n"
+       "       addl $64, %4         ;\n"
+       "       addl $64, %5         ;\n"
+       "       decl %0              ;\n"
+       "       jnz 1b               ;\n"
+       : 
+       : "g" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+       : "memory");
+
+       FPU_RESTORE;
+}
+
+static struct xor_block_template xor_block_pII_mmx = {
+       name: "pII_mmx",
+       do_2: xor_pII_mmx_2,
+       do_3: xor_pII_mmx_3,
+       do_4: xor_pII_mmx_4,
+       do_5: xor_pII_mmx_5,
+};
+
+static struct xor_block_template xor_block_p5_mmx = {
+       name: "p5_mmx",
+       do_2: xor_p5_mmx_2,
+       do_3: xor_p5_mmx_3,
+       do_4: xor_p5_mmx_4,
+       do_5: xor_p5_mmx_5,
+};
+
+#undef FPU_SAVE
+#undef FPU_RESTORE
+
+/*
+ * Cache-avoiding checksumming functions utilizing KNI (SSE) instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+#define XMMS_SAVE                              \
+       __asm__ __volatile__ (                  \
+               "movl %%cr0,%0          ;\n\t"  \
+               "clts                   ;\n\t"  \
+               "movups %%xmm0,(%1)     ;\n\t"  \
+               "movups %%xmm1,0x10(%1) ;\n\t"  \
+               "movups %%xmm2,0x20(%1) ;\n\t"  \
+               "movups %%xmm3,0x30(%1) ;\n\t"  \
+               : "=r" (cr0)                    \
+               : "r" (xmm_save)                \
+               : "memory")
+
+#define XMMS_RESTORE                           \
+       __asm__ __volatile__ (                  \
+               "sfence                 ;\n\t"  \
+               "movups (%1),%%xmm0     ;\n\t"  \
+               "movups 0x10(%1),%%xmm1 ;\n\t"  \
+               "movups 0x20(%1),%%xmm2 ;\n\t"  \
+               "movups 0x30(%1),%%xmm3 ;\n\t"  \
+               "movl   %0,%%cr0        ;\n\t"  \
+               :                               \
+               : "r" (cr0), "r" (xmm_save)     \
+               : "memory")
+
+#define OFFS(x)                "16*("#x")"
+#define        PF0(x)          "       prefetcht0  "OFFS(x)"(%1)   ;\n"
+#define LD(x,y)                "       movaps   "OFFS(x)"(%1), %%xmm"#y"   ;\n"
+#define ST(x,y)                "       movaps %%xmm"#y",   "OFFS(x)"(%1)   ;\n"
+#define PF1(x)         "       prefetchnta "OFFS(x)"(%2)   ;\n"
+#define PF2(x)         "       prefetchnta "OFFS(x)"(%3)   ;\n"
+#define PF3(x)         "       prefetchnta "OFFS(x)"(%4)   ;\n"
+#define PF4(x)         "       prefetchnta "OFFS(x)"(%5)   ;\n"
+#define PF5(x)         "       prefetchnta "OFFS(x)"(%6)   ;\n"
+#define XO1(x,y)       "       xorps   "OFFS(x)"(%2), %%xmm"#y"   ;\n"
+#define XO2(x,y)       "       xorps   "OFFS(x)"(%3), %%xmm"#y"   ;\n"
+#define XO3(x,y)       "       xorps   "OFFS(x)"(%4), %%xmm"#y"   ;\n"
+#define XO4(x,y)       "       xorps   "OFFS(x)"(%5), %%xmm"#y"   ;\n"
+#define XO5(x,y)       "       xorps   "OFFS(x)"(%6), %%xmm"#y"   ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+        unsigned long lines = bytes >> 8;
+       char xmm_save[16*4];
+       int cr0;
+
+       XMMS_SAVE;
+
+        __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+               LD(i,0)                                 \
+                       LD(i+1,1)                       \
+               PF1(i)                                  \
+                               PF1(i+2)                \
+                               LD(i+2,2)               \
+                                       LD(i+3,3)       \
+               PF0(i+4)                                \
+                               PF0(i+6)                \
+               XO1(i,0)                                \
+                       XO1(i+1,1)                      \
+                               XO1(i+2,2)              \
+                                       XO1(i+3,3)      \
+               ST(i,0)                                 \
+                       ST(i+1,1)                       \
+                               ST(i+2,2)               \
+                                       ST(i+3,3)       \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+        " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+        "       addl $256, %1           ;\n"
+        "       addl $256, %2           ;\n"
+        "       decl %0                 ;\n"
+        "       jnz 1b                  ;\n"
+       :
+       : "r" (lines),
+         "r" (p1), "r" (p2)
+        : "memory");
+
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3)
+{
+        unsigned long lines = bytes >> 8;
+       char xmm_save[16*4];
+       int cr0;
+
+       XMMS_SAVE;
+
+        __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i+2)                \
+               LD(i,0)                                 \
+                       LD(i+1,1)                       \
+                               LD(i+2,2)               \
+                                       LD(i+3,3)       \
+               PF2(i)                                  \
+                               PF2(i+2)                \
+               PF0(i+4)                                \
+                               PF0(i+6)                \
+               XO1(i,0)                                \
+                       XO1(i+1,1)                      \
+                               XO1(i+2,2)              \
+                                       XO1(i+3,3)      \
+               XO2(i,0)                                \
+                       XO2(i+1,1)                      \
+                               XO2(i+2,2)              \
+                                       XO2(i+3,3)      \
+               ST(i,0)                                 \
+                       ST(i+1,1)                       \
+                               ST(i+2,2)               \
+                                       ST(i+3,3)       \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+        " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+        "       addl $256, %1           ;\n"
+        "       addl $256, %2           ;\n"
+        "       addl $256, %3           ;\n"
+        "       decl %0                 ;\n"
+        "       jnz 1b                  ;\n"
+       :
+       : "r" (lines),
+         "r" (p1), "r"(p2), "r"(p3)
+        : "memory" );
+
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3, unsigned long *p4)
+{
+        unsigned long lines = bytes >> 8;
+       char xmm_save[16*4];
+       int cr0;
+
+       XMMS_SAVE;
+
+        __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i+2)                \
+               LD(i,0)                                 \
+                       LD(i+1,1)                       \
+                               LD(i+2,2)               \
+                                       LD(i+3,3)       \
+               PF2(i)                                  \
+                               PF2(i+2)                \
+               XO1(i,0)                                \
+                       XO1(i+1,1)                      \
+                               XO1(i+2,2)              \
+                                       XO1(i+3,3)      \
+               PF3(i)                                  \
+                               PF3(i+2)                \
+               PF0(i+4)                                \
+                               PF0(i+6)                \
+               XO2(i,0)                                \
+                       XO2(i+1,1)                      \
+                               XO2(i+2,2)              \
+                                       XO2(i+3,3)      \
+               XO3(i,0)                                \
+                       XO3(i+1,1)                      \
+                               XO3(i+2,2)              \
+                                       XO3(i+3,3)      \
+               ST(i,0)                                 \
+                       ST(i+1,1)                       \
+                               ST(i+2,2)               \
+                                       ST(i+3,3)       \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+        " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+        "       addl $256, %1           ;\n"
+        "       addl $256, %2           ;\n"
+        "       addl $256, %3           ;\n"
+        "       addl $256, %4           ;\n"
+        "       decl %0                 ;\n"
+        "       jnz 1b                  ;\n"
+       :
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+        : "memory" );
+
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+        unsigned long lines = bytes >> 8;
+       char xmm_save[16*4];
+       int cr0;
+
+       XMMS_SAVE;
+
+        __asm__ __volatile__ (
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i+2)                \
+               LD(i,0)                                 \
+                       LD(i+1,1)                       \
+                               LD(i+2,2)               \
+                                       LD(i+3,3)       \
+               PF2(i)                                  \
+                               PF2(i+2)                \
+               XO1(i,0)                                \
+                       XO1(i+1,1)                      \
+                               XO1(i+2,2)              \
+                                       XO1(i+3,3)      \
+               PF3(i)                                  \
+                               PF3(i+2)                \
+               XO2(i,0)                                \
+                       XO2(i+1,1)                      \
+                               XO2(i+2,2)              \
+                                       XO2(i+3,3)      \
+               PF4(i)                                  \
+                               PF4(i+2)                \
+               PF0(i+4)                                \
+                               PF0(i+6)                \
+               XO3(i,0)                                \
+                       XO3(i+1,1)                      \
+                               XO3(i+2,2)              \
+                                       XO3(i+3,3)      \
+               XO4(i,0)                                \
+                       XO4(i+1,1)                      \
+                               XO4(i+2,2)              \
+                                       XO4(i+3,3)      \
+               ST(i,0)                                 \
+                       ST(i+1,1)                       \
+                               ST(i+2,2)               \
+                                       ST(i+3,3)       \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+        " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+        "       addl $256, %1           ;\n"
+        "       addl $256, %2           ;\n"
+        "       addl $256, %3           ;\n"
+        "       addl $256, %4           ;\n"
+        "       addl $256, %5           ;\n"
+        "       decl %0                 ;\n"
+        "       jnz 1b                  ;\n"
+       :
+       : "r" (lines),
+         "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+       : "memory");
+
+       XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_pIII_sse = {
+        name: "pIII_sse",
+        do_2: xor_sse_2,
+        do_3: xor_sse_3,
+        do_4: xor_sse_4,
+        do_5: xor_sse_5,
+};
+
+/* Also try the generic routines.  */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES                              \
+       do {                                            \
+               xor_speed(&xor_block_8regs);            \
+               xor_speed(&xor_block_32regs);           \
+               if (cpu_has_xmm)                        \
+                       xor_speed(&xor_block_pIII_sse); \
+               if (md_cpu_has_mmx()) {                 \
+                       xor_speed(&xor_block_pII_mmx);  \
+                       xor_speed(&xor_block_p5_mmx);   \
+               }                                       \
+       } while (0)
+
+/* We force the use of the SSE xor block because it can write around the
+   L2 cache.  We may also be able to load only into the L1 cache, depending
+   on how the cpu deals with a load to a line that is being prefetched.  */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+       (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
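
Note: every MMX routine above consumes 64 bytes per loop iteration (hence
lines = bytes >> 6), the SSE routines 256 bytes (bytes >> 8), but all of
them compute the same result as a plain word-wise XOR into p1.  A minimal
C sketch of the two-source case, matching in effect what the generic
xor_block_8regs template does (illustration only, not part of the patch):

        static void
        xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
        {
                long lines = bytes / (8 * sizeof(unsigned long));

                do {
                        /* XOR eight words of p2 into p1 per iteration. */
                        p1[0] ^= p2[0];
                        p1[1] ^= p2[1];
                        p1[2] ^= p2[2];
                        p1[3] ^= p2[3];
                        p1[4] ^= p2[4];
                        p1[5] ^= p2[5];
                        p1[6] ^= p2[6];
                        p1[7] ^= p2[7];
                        p1 += 8;
                        p2 += 8;
                } while (--lines > 0);
        }
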
diff --git a/include/asm-ia64/xor.h b/include/asm-ia64/xor.h
new file mode 100644 (file)
index 0000000..28aca66
--- /dev/null
@@ -0,0 +1,283 @@
+/*
+ * include/asm-ia64/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for IA-64.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
+                      unsigned long *);
+extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
+                      unsigned long *, unsigned long *);
+extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
+                      unsigned long *, unsigned long *, unsigned long *);
+
+asm ("
+       .text
+
+       // Assume L2 memory latency of 6 cycles.
+
+       .proc xor_ia64_2
+xor_ia64_2:
+       .prologue
+       .fframe 0
+       { .mii
+         .save ar.pfs, r31
+         alloc r31 = ar.pfs, 3, 0, 13, 16
+         .save ar.lc, r30
+         mov r30 = ar.lc
+         .save pr, r29
+         mov r29 = pr
+         ;;
+       }
+       .body
+       { .mii
+         mov r8 = in1
+         mov ar.ec = 6 + 2
+         shr in0 = in0, 3
+         ;;
+       }
+       { .mmi
+         adds in0 = -1, in0
+         mov r16 = in1
+         mov r17 = in2
+         ;;
+       }
+       { .mii
+         mov ar.lc = in0
+         mov pr.rot = 1 << 16
+         ;;
+       }
+       .rotr s1[6+1], s2[6+1], d[2]
+       .rotp p[6+2]
+0:      { .mmi
+(p[0])   ld8.nta s1[0] = [r16], 8
+(p[0])   ld8.nta s2[0] = [r17], 8
+(p[6])   xor d[0] = s1[6], s2[6]
+       }
+       { .mfb
+(p[6+1])  st8.nta [r8] = d[1], 8
+         nop.f 0
+         br.ctop.dptk.few 0b
+         ;;
+       }
+       { .mii
+         mov ar.lc = r30
+         mov pr = r29, -1
+       }
+       { .bbb
+         br.ret.sptk.few rp
+       }
+       .endp xor_ia64_2
+
+       .proc xor_ia64_3
+xor_ia64_3:
+       .prologue
+       .fframe 0
+       { .mii
+         .save ar.pfs, r31
+         alloc r31 = ar.pfs, 4, 0, 20, 24
+         .save ar.lc, r30
+         mov r30 = ar.lc
+         .save pr, r29
+         mov r29 = pr
+         ;;
+       }
+       .body
+       { .mii
+         mov r8 = in1
+         mov ar.ec = 6 + 2
+         shr in0 = in0, 3
+         ;;
+       }
+       { .mmi
+         adds in0 = -1, in0
+         mov r16 = in1
+         mov r17 = in2
+         ;;
+       }
+       { .mii
+         mov r18 = in3
+         mov ar.lc = in0
+         mov pr.rot = 1 << 16
+         ;;
+       }
+       .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
+       .rotp p[6+2]
+0:     { .mmi
+(p[0])   ld8.nta s1[0] = [r16], 8
+(p[0])   ld8.nta s2[0] = [r17], 8
+(p[6])   xor d[0] = s1[6], s2[6]
+         ;;
+       }
+       { .mmi
+(p[0])   ld8.nta s3[0] = [r18], 8
+(p[6+1])  st8.nta [r8] = d[1], 8
+(p[6])   xor d[0] = d[0], s3[6]
+       }
+       { .bbb
+         br.ctop.dptk.few 0b
+         ;;
+       }
+       { .mii
+         mov ar.lc = r30
+         mov pr = r29, -1
+       }
+       { .bbb
+         br.ret.sptk.few rp
+       }
+       .endp xor_ia64_3
+
+       .proc xor_ia64_4
+xor_ia64_4:
+       .prologue
+       .fframe 0
+       { .mii
+         .save ar.pfs, r31
+         alloc r31 = ar.pfs, 5, 0, 27, 32
+         .save ar.lc, r30
+         mov r30 = ar.lc
+         .save pr, r29
+         mov r29 = pr
+         ;;
+       }
+       .body
+       { .mii
+         mov r8 = in1
+         mov ar.ec = 6 + 2
+         shr in0 = in0, 3
+         ;;
+       }
+       { .mmi
+         adds in0 = -1, in0
+         mov r16 = in1
+         mov r17 = in2
+         ;;
+       }
+       { .mii
+         mov r18 = in3
+         mov ar.lc = in0
+         mov pr.rot = 1 << 16
+       }
+       { .mfb
+         mov r19 = in4
+         ;;
+       }
+       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
+       .rotp p[6+2]
+0:     { .mmi
+(p[0])   ld8.nta s1[0] = [r16], 8
+(p[0])   ld8.nta s2[0] = [r17], 8
+(p[6])   xor d[0] = s1[6], s2[6]
+       }
+       { .mmi
+(p[0])   ld8.nta s3[0] = [r18], 8
+(p[0])   ld8.nta s4[0] = [r19], 8
+(p[6])   xor r20 = s3[6], s4[6]
+         ;;
+       }
+       { .mib
+(p[6+1])  st8.nta [r8] = d[1], 8
+(p[6])   xor d[0] = d[0], r20
+         br.ctop.dptk.few 0b
+         ;;
+       }
+       { .mii
+         mov ar.lc = r30
+         mov pr = r29, -1
+       }
+       { .bbb
+         br.ret.sptk.few rp
+       }
+       .endp xor_ia64_4
+
+       .proc xor_ia64_5
+xor_ia64_5:
+       .prologue
+       .fframe 0
+       { .mii
+         .save ar.pfs, r31
+         alloc r31 = ar.pfs, 6, 0, 34, 40
+         .save ar.lc, r30
+         mov r30 = ar.lc
+         .save pr, r29
+         mov r29 = pr
+         ;;
+       }
+       .body
+       { .mii
+         mov r8 = in1
+         mov ar.ec = 6 + 2
+         shr in0 = in0, 3
+         ;;
+       }
+       { .mmi
+         adds in0 = -1, in0
+         mov r16 = in1
+         mov r17 = in2
+         ;;
+       }
+       { .mii
+         mov r18 = in3
+         mov ar.lc = in0
+         mov pr.rot = 1 << 16
+       }
+       { .mib
+         mov r19 = in4
+         mov r20 = in5
+         ;;
+       }
+       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
+       .rotp p[6+2]
+0:     { .mmi
+(p[0])   ld8.nta s1[0] = [r16], 8
+(p[0])   ld8.nta s2[0] = [r17], 8
+(p[6])   xor d[0] = s1[6], s2[6]
+       }
+       { .mmi
+(p[0])   ld8.nta s3[0] = [r18], 8
+(p[0])   ld8.nta s4[0] = [r19], 8
+(p[6])   xor r21 = s3[6], s4[6]
+         ;;
+       }
+       { .mmi
+(p[0])   ld8.nta s5[0] = [r20], 8
+(p[6+1])  st8.nta [r8] = d[1], 8
+(p[6])   xor d[0] = d[0], r21
+         ;;
+       }
+       { .mfb
+(p[6])   xor d[0] = d[0], s5[6]
+         nop.f 0
+         br.ctop.dptk.few 0b
+         ;;
+       }
+       { .mii
+         mov ar.lc = r30
+         mov pr = r29, -1
+       }
+       { .bbb
+         br.ret.sptk.few rp
+       }
+       .endp xor_ia64_5
+");
+
+static struct xor_block_template xor_block_ia64 = {
+       name: "ia64",
+       do_2: xor_ia64_2,
+       do_3: xor_ia64_3,
+       do_4: xor_ia64_4,
+       do_5: xor_ia64_5,
+};
+
+#define XOR_TRY_TEMPLATES      xor_speed(&xor_block_ia64)
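
Note: the IA-64 loops above are software-pipelined with rotating registers.
With ar.ec = 6 + 2, each iteration issues the loads for the current element
(stage p[0]), performs the XOR six stages later (p[6], matching the assumed
6-cycle load latency) and stores one stage after that.  A rough C model of
that pipeline for the two-source case, with plain bounds checks standing in
for the rotating predicates (illustration only, not part of the patch):

        #define LAT 6   /* assumed load latency, in loop iterations */

        static void
        xor_pipelined_2(unsigned long bytes, unsigned long *p1,
                        unsigned long *p2)
        {
                unsigned long n = bytes / sizeof(unsigned long);
                unsigned long s1[LAT + 1], s2[LAT + 1]; /* rotating buffers */
                unsigned long i, j;

                for (i = 0; i < n + LAT; i++) {
                        if (i < n) {            /* stage p[0]: issue loads */
                                s1[i % (LAT + 1)] = p1[i];
                                s2[i % (LAT + 1)] = p2[i];
                        }
                        if (i >= LAT) {         /* stages p[6]/p[7]: xor, store */
                                j = i - LAT;
                                p1[j] = s1[j % (LAT + 1)] ^ s2[j % (LAT + 1)];
                        }
                }
        }
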
diff --git a/include/asm-m68k/xor.h b/include/asm-m68k/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-mips/xor.h b/include/asm-mips/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-mips64/xor.h b/include/asm-mips64/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-ppc/xor.h b/include/asm-ppc/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-s390/xor.h b/include/asm-s390/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-sh/xor.h b/include/asm-sh/xor.h
new file mode 100644 (file)
index 0000000..c82eb12
--- /dev/null
@@ -0,0 +1 @@
+#include <asm-generic/xor.h>
diff --git a/include/asm-sparc/xor.h b/include/asm-sparc/xor.h
new file mode 100644 (file)
index 0000000..f9af76a
--- /dev/null
@@ -0,0 +1,273 @@
+/*
+ * include/asm-sparc/xor.h
+ *
+ * Optimized RAID-5 checksumming functions for 32-bit Sparc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High speed xor_block operation for RAID4/5 utilizing the
+ * ldd/std SPARC instructions.
+ *
+ * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+static void
+sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       int lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               __asm__ __volatile__("
+                 ldd [%0 + 0x00], %%g2
+                 ldd [%0 + 0x08], %%g4
+                 ldd [%0 + 0x10], %%o0
+                 ldd [%0 + 0x18], %%o2
+                 ldd [%1 + 0x00], %%o4
+                 ldd [%1 + 0x08], %%l0
+                 ldd [%1 + 0x10], %%l2
+                 ldd [%1 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 std %%g2, [%0 + 0x00]
+                 std %%g4, [%0 + 0x08]
+                 std %%o0, [%0 + 0x10]
+                 std %%o2, [%0 + 0x18]
+                 "
+               :
+               : "r" (p1), "r" (p2)
+               : "g2", "g3", "g4", "g5",
+                 "o0", "o1", "o2", "o3", "o4", "o5",
+                 "l0", "l1", "l2", "l3", "l4", "l5");
+               p1 += 8;
+               p2 += 8;
+       } while (--lines > 0);
+}
+
+static void
+sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+       unsigned long *p3)
+{
+       int lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               __asm__ __volatile__("
+                 ldd [%0 + 0x00], %%g2
+                 ldd [%0 + 0x08], %%g4
+                 ldd [%0 + 0x10], %%o0
+                 ldd [%0 + 0x18], %%o2
+                 ldd [%1 + 0x00], %%o4
+                 ldd [%1 + 0x08], %%l0
+                 ldd [%1 + 0x10], %%l2
+                 ldd [%1 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%2 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%2 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%2 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%2 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 std %%g2, [%0 + 0x00]
+                 std %%g4, [%0 + 0x08]
+                 std %%o0, [%0 + 0x10]
+                 std %%o2, [%0 + 0x18]
+                 "
+               :
+               : "r" (p1), "r" (p2), "r" (p3)
+               : "g2", "g3", "g4", "g5",
+                 "o0", "o1", "o2", "o3", "o4", "o5",
+                 "l0", "l1", "l2", "l3", "l4", "l5");
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+       } while (--lines > 0);
+}
+
+static void
+sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+       unsigned long *p3, unsigned long *p4)
+{
+       int lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               __asm__ __volatile__("
+                 ldd [%0 + 0x00], %%g2
+                 ldd [%0 + 0x08], %%g4
+                 ldd [%0 + 0x10], %%o0
+                 ldd [%0 + 0x18], %%o2
+                 ldd [%1 + 0x00], %%o4
+                 ldd [%1 + 0x08], %%l0
+                 ldd [%1 + 0x10], %%l2
+                 ldd [%1 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%2 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%2 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%2 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%2 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%3 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%3 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%3 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%3 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 std %%g2, [%0 + 0x00]
+                 std %%g4, [%0 + 0x08]
+                 std %%o0, [%0 + 0x10]
+                 std %%o2, [%0 + 0x18]
+                 "
+               :
+               : "r" (p1), "r" (p2), "r" (p3), "r" (p4)
+               : "g2", "g3", "g4", "g5",
+                 "o0", "o1", "o2", "o3", "o4", "o5",
+                 "l0", "l1", "l2", "l3", "l4", "l5");
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+       } while (--lines > 0);
+}
+
+static void
+sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       int lines = bytes / (sizeof (long)) / 8;
+
+       do {
+               __asm__ __volatile__("
+                 ldd [%0 + 0x00], %%g2
+                 ldd [%0 + 0x08], %%g4
+                 ldd [%0 + 0x10], %%o0
+                 ldd [%0 + 0x18], %%o2
+                 ldd [%1 + 0x00], %%o4
+                 ldd [%1 + 0x08], %%l0
+                 ldd [%1 + 0x10], %%l2
+                 ldd [%1 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%2 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%2 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%2 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%2 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%3 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%3 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%3 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%3 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 ldd [%4 + 0x00], %%o4
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 ldd [%4 + 0x08], %%l0
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 ldd [%4 + 0x10], %%l2
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 ldd [%4 + 0x18], %%l4
+                 xor %%g2, %%o4, %%g2
+                 xor %%g3, %%o5, %%g3
+                 xor %%g4, %%l0, %%g4
+                 xor %%g5, %%l1, %%g5
+                 xor %%o0, %%l2, %%o0
+                 xor %%o1, %%l3, %%o1
+                 xor %%o2, %%l4, %%o2
+                 xor %%o3, %%l5, %%o3
+                 std %%g2, [%0 + 0x00]
+                 std %%g4, [%0 + 0x08]
+                 std %%o0, [%0 + 0x10]
+                 std %%o2, [%0 + 0x18]
+                 "
+               :
+               : "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
+               : "g2", "g3", "g4", "g5",
+                 "o0", "o1", "o2", "o3", "o4", "o5",
+                 "l0", "l1", "l2", "l3", "l4", "l5");
+               p1 += 8;
+               p2 += 8;
+               p3 += 8;
+               p4 += 8;
+               p5 += 8;
+       } while (--lines > 0);
+}
+
+static struct xor_block_template xor_block_SPARC = {
+       name: "SPARC",
+       do_2: sparc_2,
+       do_3: sparc_3,
+       do_4: sparc_4,
+       do_5: sparc_5,
+};
+
+/* For grins, also test the generic routines.  */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES                              \
+       do {                                            \
+               xor_speed(&xor_block_8regs);            \
+               xor_speed(&xor_block_32regs);           \
+               xor_speed(&xor_block_SPARC);            \
+       } while (0)
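
Note: XOR_TRY_TEMPLATES is what the calibration code expands at boot when
benchmarking candidates; each xor_speed() call is expected to time one
template and remember the fastest.  A hypothetical sketch of that consumer
(the real logic lives in drivers/md/xor.c; measure_speed() and both globals
are invented names for illustration):

        extern int measure_speed(struct xor_block_template *); /* invented */

        static struct xor_block_template *template_list;       /* invented */
        static struct xor_block_template *fastest;              /* invented */

        static void
        xor_speed(struct xor_block_template *tmpl)
        {
                /* Time tmpl->do_2() over scratch buffers for a fixed
                   interval, chain the template for boot-time reporting,
                   and track the fastest candidate seen so far. */
                tmpl->speed = measure_speed(tmpl);
                tmpl->next = template_list;
                template_list = tmpl;
                if (!fastest || tmpl->speed > fastest->speed)
                        fastest = tmpl;
        }
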
diff --git a/include/asm-sparc64/xor.h b/include/asm-sparc64/xor.h
new file mode 100644 (file)
index 0000000..0a3e1e8
--- /dev/null
@@ -0,0 +1,396 @@
+/*
+ * include/asm-sparc64/xor.h
+ *
+ * High speed xor_block operation for RAID4/5 utilizing the
+ * UltraSparc Visual Instruction Set.
+ *
+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ *     Requirements:
+ *     !(((long)dest | (long)sourceN) & (64 - 1)) &&
+ *     !(len & 127) && len >= 256
+ *
+ * It is done in pure assembly, as otherwise gcc makes it a non-leaf
+ * function, which is not what we want.
+ */
+
+#include <asm/pstate.h>
+#include <asm/asi.h>
+
+extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *);
+extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *, unsigned long *);
+extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *, unsigned long *, unsigned long *);
+
+#define _S(x) __S(x)
+#define __S(x) #x
+#define DEF(x) __asm__(#x " = " _S(x))
+
+DEF(FPRS_FEF);
+DEF(FPRS_DU);
+DEF(ASI_BLK_P);
+
+/* ??? We set and use %asi instead of using ASI_BLK_P directly because gas
+   currently does not accept symbolic constants for the ASI specifier.  */
+
+__asm__ ("
+       .text
+       .globl xor_vis_2
+       .type xor_vis_2,@function
+xor_vis_2:
+       rd      %fprs, %g1
+       andcc   %g1, FPRS_FEF|FPRS_DU, %g0
+       be,pt   %icc, 0f
+        sethi  %hi(VISenter), %g1
+       jmpl    %g1 + %lo(VISenter), %g7
+        add    %g7, 8, %g7
+0:     wr      %g0, FPRS_FEF, %fprs
+       rd      %asi, %g1
+       wr      %g0, ASI_BLK_P, %asi
+       membar  #LoadStore|#StoreLoad|#StoreStore
+       sub     %o0, 128, %o0
+       ldda    [%o1] %asi, %f0
+       ldda    [%o2] %asi, %f16
+
+2:     ldda    [%o1 + 64] %asi, %f32
+       fxor    %f0, %f16, %f16
+       fxor    %f2, %f18, %f18
+       fxor    %f4, %f20, %f20
+       fxor    %f6, %f22, %f22
+       fxor    %f8, %f24, %f24
+       fxor    %f10, %f26, %f26
+       fxor    %f12, %f28, %f28
+       fxor    %f14, %f30, %f30
+       stda    %f16, [%o1] %asi
+       ldda    [%o2 + 64] %asi, %f48
+       ldda    [%o1 + 128] %asi, %f0
+       fxor    %f32, %f48, %f48
+       fxor    %f34, %f50, %f50
+       add     %o1, 128, %o1
+       fxor    %f36, %f52, %f52
+       add     %o2, 128, %o2
+       fxor    %f38, %f54, %f54
+       subcc   %o0, 128, %o0
+       fxor    %f40, %f56, %f56
+       fxor    %f42, %f58, %f58
+       fxor    %f44, %f60, %f60
+       fxor    %f46, %f62, %f62
+       stda    %f48, [%o1 - 64] %asi
+       bne,pt  %xcc, 2b
+        ldda   [%o2] %asi, %f16
+
+       ldda    [%o1 + 64] %asi, %f32
+       fxor    %f0, %f16, %f16
+       fxor    %f2, %f18, %f18
+       fxor    %f4, %f20, %f20
+       fxor    %f6, %f22, %f22
+       fxor    %f8, %f24, %f24
+       fxor    %f10, %f26, %f26
+       fxor    %f12, %f28, %f28
+       fxor    %f14, %f30, %f30
+       stda    %f16, [%o1] %asi
+       ldda    [%o2 + 64] %asi, %f48
+       membar  #Sync
+       fxor    %f32, %f48, %f48
+       fxor    %f34, %f50, %f50
+       fxor    %f36, %f52, %f52
+       fxor    %f38, %f54, %f54
+       fxor    %f40, %f56, %f56
+       fxor    %f42, %f58, %f58
+       fxor    %f44, %f60, %f60
+       fxor    %f46, %f62, %f62
+       stda    %f48, [%o1 + 64] %asi
+       membar  #Sync|#StoreStore|#StoreLoad
+       wr      %g1, %g0, %asi
+       retl
+         wr    %g0, 0, %fprs
+       .size xor_vis_2, .-xor_vis_2
+
+
+       .globl xor_vis_3
+       .type xor_vis_3,@function
+xor_vis_3:
+       rd      %fprs, %g1
+       andcc   %g1, FPRS_FEF|FPRS_DU, %g0
+       be,pt   %icc, 0f
+        sethi  %hi(VISenter), %g1
+       jmpl    %g1 + %lo(VISenter), %g7
+        add    %g7, 8, %g7
+0:     wr      %g0, FPRS_FEF, %fprs
+       rd      %asi, %g1
+       wr      %g0, ASI_BLK_P, %asi
+       membar  #LoadStore|#StoreLoad|#StoreStore
+       sub     %o0, 64, %o0
+       ldda    [%o1] %asi, %f0
+       ldda    [%o2] %asi, %f16
+
+3:     ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f48
+       fxor    %f2, %f18, %f50
+       add     %o1, 64, %o1
+       fxor    %f4, %f20, %f52
+       fxor    %f6, %f22, %f54
+       add     %o2, 64, %o2
+       fxor    %f8, %f24, %f56
+       fxor    %f10, %f26, %f58
+       fxor    %f12, %f28, %f60
+       fxor    %f14, %f30, %f62
+       ldda    [%o1] %asi, %f0
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       add     %o3, 64, %o3
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       subcc   %o0, 64, %o0
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       stda    %f48, [%o1 - 64] %asi
+       bne,pt  %xcc, 3b
+        ldda   [%o2] %asi, %f16
+
+       ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f48
+       fxor    %f2, %f18, %f50
+       fxor    %f4, %f20, %f52
+       fxor    %f6, %f22, %f54
+       fxor    %f8, %f24, %f56
+       fxor    %f10, %f26, %f58
+       fxor    %f12, %f28, %f60
+       fxor    %f14, %f30, %f62
+       membar  #Sync
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       stda    %f48, [%o1] %asi
+       membar  #Sync|#StoreStore|#StoreLoad
+       wr      %g1, %g0, %asi
+       retl
+        wr     %g0, 0, %fprs
+       .size xor_vis_3, .-xor_vis_3
+
+
+       .globl xor_vis_4
+       .type xor_vis_4,@function
+xor_vis_4:
+       rd      %fprs, %g1
+       andcc   %g1, FPRS_FEF|FPRS_DU, %g0
+       be,pt   %icc, 0f
+        sethi  %hi(VISenter), %g1
+       jmpl    %g1 + %lo(VISenter), %g7
+        add    %g7, 8, %g7
+0:     wr      %g0, FPRS_FEF, %fprs
+       rd      %asi, %g1
+       wr      %g0, ASI_BLK_P, %asi
+       membar  #LoadStore|#StoreLoad|#StoreStore
+       sub     %o0, 64, %o0
+       ldda    [%o1] %asi, %f0
+       ldda    [%o2] %asi, %f16
+
+4:     ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f16
+       fxor    %f2, %f18, %f18
+       add     %o1, 64, %o1
+       fxor    %f4, %f20, %f20
+       fxor    %f6, %f22, %f22
+       add     %o2, 64, %o2
+       fxor    %f8, %f24, %f24
+       fxor    %f10, %f26, %f26
+       fxor    %f12, %f28, %f28
+       fxor    %f14, %f30, %f30
+       ldda    [%o4] %asi, %f48
+       fxor    %f16, %f32, %f32
+       fxor    %f18, %f34, %f34
+       fxor    %f20, %f36, %f36
+       fxor    %f22, %f38, %f38
+       add     %o3, 64, %o3
+       fxor    %f24, %f40, %f40
+       fxor    %f26, %f42, %f42
+       fxor    %f28, %f44, %f44
+       fxor    %f30, %f46, %f46
+       ldda    [%o1] %asi, %f0
+       fxor    %f32, %f48, %f48
+       fxor    %f34, %f50, %f50
+       fxor    %f36, %f52, %f52
+       add     %o4, 64, %o4
+       fxor    %f38, %f54, %f54
+       fxor    %f40, %f56, %f56
+       fxor    %f42, %f58, %f58
+       subcc   %o0, 64, %o0
+       fxor    %f44, %f60, %f60
+       fxor    %f46, %f62, %f62
+       stda    %f48, [%o1 - 64] %asi
+       bne,pt  %xcc, 4b
+        ldda   [%o2] %asi, %f16
+
+       ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f16
+       fxor    %f2, %f18, %f18
+       fxor    %f4, %f20, %f20
+       fxor    %f6, %f22, %f22
+       fxor    %f8, %f24, %f24
+       fxor    %f10, %f26, %f26
+       fxor    %f12, %f28, %f28
+       fxor    %f14, %f30, %f30
+       ldda    [%o4] %asi, %f48
+       fxor    %f16, %f32, %f32
+       fxor    %f18, %f34, %f34
+       fxor    %f20, %f36, %f36
+       fxor    %f22, %f38, %f38
+       fxor    %f24, %f40, %f40
+       fxor    %f26, %f42, %f42
+       fxor    %f28, %f44, %f44
+       fxor    %f30, %f46, %f46
+       membar  #Sync
+       fxor    %f32, %f48, %f48
+       fxor    %f34, %f50, %f50
+       fxor    %f36, %f52, %f52
+       fxor    %f38, %f54, %f54
+       fxor    %f40, %f56, %f56
+       fxor    %f42, %f58, %f58
+       fxor    %f44, %f60, %f60
+       fxor    %f46, %f62, %f62
+       stda    %f48, [%o1] %asi
+       membar  #Sync|#StoreStore|#StoreLoad
+       wr      %g1, %g0, %asi
+       retl
+        wr     %g0, 0, %fprs
+       .size xor_vis_4, .-xor_vis_4
+
+
+       .globl xor_vis_5
+       .type xor_vis_5,@function
+xor_vis_5:
+       rd      %fprs, %g1
+       andcc   %g1, FPRS_FEF|FPRS_DU, %g0
+       be,pt   %icc, 0f
+        sethi  %hi(VISenter), %g1
+       jmpl    %g1 + %lo(VISenter), %g7
+        add    %g7, 8, %g7
+0:     wr      %g0, FPRS_FEF, %fprs
+       rd      %asi, %g1
+       wr      %g0, ASI_BLK_P, %asi
+       membar  #LoadStore|#StoreLoad|#StoreStore
+       sub     %o0, 64, %o0
+       ldda    [%o1] %asi, %f0
+       ldda    [%o2] %asi, %f16
+
+5:     ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f48
+       fxor    %f2, %f18, %f50
+       add     %o1, 64, %o1
+       fxor    %f4, %f20, %f52
+       fxor    %f6, %f22, %f54
+       add     %o2, 64, %o2
+       fxor    %f8, %f24, %f56
+       fxor    %f10, %f26, %f58
+       fxor    %f12, %f28, %f60
+       fxor    %f14, %f30, %f62
+       ldda    [%o4] %asi, %f16
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       add     %o3, 64, %o3
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       ldda    [%o5] %asi, %f32
+       fxor    %f48, %f16, %f48
+       fxor    %f50, %f18, %f50
+       add     %o4, 64, %o4
+       fxor    %f52, %f20, %f52
+       fxor    %f54, %f22, %f54
+       add     %o5, 64, %o5
+       fxor    %f56, %f24, %f56
+       fxor    %f58, %f26, %f58
+       fxor    %f60, %f28, %f60
+       fxor    %f62, %f30, %f62
+       ldda    [%o1] %asi, %f0
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       subcc   %o0, 64, %o0
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       stda    %f48, [%o1 - 64] %asi
+       bne,pt  %xcc, 5b
+        ldda   [%o2] %asi, %f16
+
+       ldda    [%o3] %asi, %f32
+       fxor    %f0, %f16, %f48
+       fxor    %f2, %f18, %f50
+       fxor    %f4, %f20, %f52
+       fxor    %f6, %f22, %f54
+       fxor    %f8, %f24, %f56
+       fxor    %f10, %f26, %f58
+       fxor    %f12, %f28, %f60
+       fxor    %f14, %f30, %f62
+       ldda    [%o4] %asi, %f16
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       ldda    [%o5] %asi, %f32
+       fxor    %f48, %f16, %f48
+       fxor    %f50, %f18, %f50
+       fxor    %f52, %f20, %f52
+       fxor    %f54, %f22, %f54
+       fxor    %f56, %f24, %f56
+       fxor    %f58, %f26, %f58
+       fxor    %f60, %f28, %f60
+       fxor    %f62, %f30, %f62
+       membar  #Sync
+       fxor    %f48, %f32, %f48
+       fxor    %f50, %f34, %f50
+       fxor    %f52, %f36, %f52
+       fxor    %f54, %f38, %f54
+       fxor    %f56, %f40, %f56
+       fxor    %f58, %f42, %f58
+       fxor    %f60, %f44, %f60
+       fxor    %f62, %f46, %f62
+       stda    %f48, [%o1] %asi
+       membar  #Sync|#StoreStore|#StoreLoad
+       wr      %g1, %g0, %asi
+       retl
+        wr     %g0, 0, %fprs
+       .size xor_vis_5, .-xor_vis_5
+");
+
+static struct xor_block_template xor_block_VIS = {
+        name: "VIS",
+        do_2: xor_vis_2,
+        do_3: xor_vis_3,
+        do_4: xor_vis_4,
+        do_5: xor_vis_5,
+};
+
+#define XOR_TRY_TEMPLATES       xor_speed(&xor_block_VIS)
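
Note: the VIS routines only handle buffers meeting the requirements quoted
at the top of the file: destination and sources 64-byte aligned, length a
multiple of 128 bytes and at least 256.  A small helper one might use to
assert this before dispatching (illustrative only; the name is invented):

        /* Check the xor_vis_* preconditions stated above. */
        static int
        xor_vis_ok(unsigned long bytes, unsigned long *dest,
                   unsigned long *src)
        {
                if (((unsigned long)dest | (unsigned long)src) & (64 - 1))
                        return 0;       /* not 64-byte aligned */
                if (bytes & 127)
                        return 0;       /* not a multiple of 128 */
                return bytes >= 256;
        }
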
index 6ef5b4c..f367280 100644 (file)
@@ -73,7+73,7 @@ extern struct kernel_param __setup_start, __setup_end;
  * Mark functions and data as being only used at initialization
  * or exit time.
  */
-#define __init         __attribute__ ((__section__ (".text.init")))
+#define __init         /* __attribute__ ((__section__ (".text.init"))) */
 #define __exit         __attribute__ ((unused, __section__(".text.exit")))
 #define __initdata     __attribute__ ((__section__ (".data.init")))
 #define __exitdata     __attribute__ ((unused, __section__ (".data.exit")))
dissimilarity index 70%
index c8034b7..0e6950a 100644 (file)
-#ifndef _XOR_H
-#define _XOR_H
-
-#include <linux/raid/md.h>
-
-#define MAX_XOR_BLOCKS 4
-
-extern void calibrate_xor_block(void);
-extern void (*xor_block)(unsigned int count,
-                         struct buffer_head **bh_ptr);
-
-#endif
+#ifndef _XOR_H
+#define _XOR_H
+
+#include <linux/raid/md.h>
+
+#define MAX_XOR_BLOCKS 5
+
+extern void xor_block(unsigned int count, struct buffer_head **bh_ptr);
+
+struct xor_block_template {
+        struct xor_block_template *next;
+        const char *name;
+        int speed;
+       void (*do_2)(unsigned long, unsigned long *, unsigned long *);
+       void (*do_3)(unsigned long, unsigned long *, unsigned long *,
+                    unsigned long *);
+       void (*do_4)(unsigned long, unsigned long *, unsigned long *,
+                    unsigned long *, unsigned long *);
+       void (*do_5)(unsigned long, unsigned long *, unsigned long *,
+                    unsigned long *, unsigned long *, unsigned long *);
+};
+
+#endif
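
Note: with MAX_XOR_BLOCKS raised to 5 and per-arity function pointers in the
template, the exported xor_block() presumably just dispatches on count to
the chosen template's do_2 .. do_5 hook.  A hedged sketch of that dispatch,
assuming the buffer_heads share one size and the calibrated winner sits in
a pointer named active_template (the name is invented here):

        static struct xor_block_template *active_template;     /* invented */

        void
        xor_block(unsigned int count, struct buffer_head **bh_ptr)
        {
                unsigned long *p[5];    /* MAX_XOR_BLOCKS */
                unsigned long bytes = bh_ptr[0]->b_size;
                unsigned int i;

                for (i = 0; i < count; i++)
                        p[i] = (unsigned long *)bh_ptr[i]->b_data;

                switch (count) {
                case 2:
                        active_template->do_2(bytes, p[0], p[1]);
                        break;
                case 3:
                        active_template->do_3(bytes, p[0], p[1], p[2]);
                        break;
                case 4:
                        active_template->do_4(bytes, p[0], p[1], p[2], p[3]);
                        break;
                case 5:
                        active_template->do_5(bytes, p[0], p[1], p[2], p[3],
                                              p[4]);
                        break;
                }
        }
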
index c8f4740..c5db140 100644 (file)
@@ -486,10+486,6 @@ EXPORT_SYMBOL(remove_inode_hash);
 EXPORT_SYMBOL(make_bad_inode);
 EXPORT_SYMBOL(is_bad_inode);
 EXPORT_SYMBOL(event);
-EXPORT_SYMBOL(__down);
-EXPORT_SYMBOL(__down_interruptible);
-EXPORT_SYMBOL(__down_trylock);
-EXPORT_SYMBOL(__up);
 EXPORT_SYMBOL(brw_page);
 
 #ifdef CONFIG_UID16
index f54480e..8e7455d 100644 (file)
@@ -433,15+433,27 @@ static inline void __schedule_tail(struct task_struct *prev)
        int policy;
 
        /*
+        * prev->policy can be written from here only before `prev'
+        * can be scheduled again (i.e. before prev->has_cpu is set
+        * to zero).  Of course it must also be read before allowing
+        * prev to be rescheduled, but since the write depends on the
+        * read completing, wmb() is enough.  (The spin_lock() acquired
+        * before setting has_cpu is not enough, because spin_lock()'s
+        * common-code semantics allow code outside the critical
+        * section to be reordered into it.)
+        */
+       policy = prev->policy;
+       prev->policy = policy & ~SCHED_YIELD;
+       wmb();
+
+       /*
         * fast path falls through. We have to clear has_cpu before
         * checking prev->state to avoid a wakeup race - thus we
         * also have to protect against the task exiting early.
         */
        task_lock(prev);
-       policy = prev->policy;
-       prev->policy = policy & ~SCHED_YIELD;
        prev->has_cpu = 0;
-       wmb();
+       mb();
        if (prev->state == TASK_RUNNING)
                goto needs_resched;
 
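
Note on the sched.c hunk above: prev->policy must be rewritten before
has_cpu is cleared, because the moment has_cpu reaches zero another CPU may
run prev and read policy; and prev->state must be loaded only after the
has_cpu store is visible, hence the full mb().  A distilled two-CPU sketch
of the ordering contract (the reschedule paths are simplified and purely
illustrative):

        /* CPU A: releasing prev after a context switch. */
        static void
        cpu_a(struct task_struct *prev)
        {
                prev->policy &= ~SCHED_YIELD;
                wmb();                  /* policy write visible first */
                prev->has_cpu = 0;      /* release: B may now take prev */
                mb();                   /* order store above vs load below */
                if (prev->state == TASK_RUNNING)
                        ;               /* lost-wakeup fixup path */
        }

        /* CPU B: deciding whether it may schedule prev. */
        static int
        cpu_b(struct task_struct *prev)
        {
                if (prev->has_cpu)
                        return 0;       /* still running on A; retry later */
                /* From here B must see the cleared SCHED_YIELD bit. */
                return !(prev->policy & SCHED_YIELD);
        }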