Index: arch/parisc/kernel/pacache.S =================================================================== RCS file: /var/cvs/linux-2.6/arch/parisc/kernel/pacache.S,v retrieving revision 1.13 diff -u -p -r1.13 pacache.S --- arch/parisc/kernel/pacache.S 19 Dec 2004 04:50:35 -0000 1.13 +++ arch/parisc/kernel/pacache.S 29 Dec 2004 05:37:46 -0000 @@ -295,17 +295,72 @@ copy_user_page_asm: .callinfo NO_CALLS .entry - ldi 64, %r1 +#ifdef __LP64__ + /* PA8x00 CPUs can consume 2 loads and 2 stores per cycle. + * Unroll the loop by hand and arrange the instructions appropriately. + * GCC probably can do this just as well. + * + * Prefetching and using more regs to increase the "distance" + * between ldd and corresponding std are possible optimizations. + */ + + ldi 32, %r1 /* PAGE_SIZE/128 == 32 */ + +1: ldd 0(%r25), %r19 /* prolog == 1 bundle */ + ldd 8(%r25), %r20 + + ldd 16(%r25), %r21 /* bundle 2 */ + ldd 24(%r25), %r22 + std %r19, 0(%r26) + std %r20, 8(%r26) + + ldd 32(%r25), %r19 /* bundle 3 */ + ldd 40(%r25), %r20 + std %r21, 16(%r26) + std %r22, 24(%r26) + + ldd 48(%r25), %r21 /* bundle 4 */ + ldd 56(%r25), %r22 + std %r19, 32(%r26) + std %r20, 40(%r26) + + ldd 64(%r25), %r19 /* bundle 5 */ + ldd 72(%r25), %r20 + std %r21, 48(%r26) + std %r22, 56(%r26) + + ldd 80(%r25), %r21 /* bundle 6 */ + ldd 88(%r25), %r22 + std %r19, 64(%r26) + std %r20, 72(%r26) + + ldd 96(%r25), %r19 /* bundle 7 */ + ldd 104(%r25), %r20 + std %r21, 80(%r26) + std %r22, 88(%r26) + + ldd 112(%r25), %r21 /* bundle 8 */ + ldd 120(%r25), %r22 + std %r19, 96(%r26) + std %r20, 104(%r26) + + ldo 128(%r25), %r25 /* epilog == 2 bundles */ + std %r21, 112(%r26) + std %r22, 120(%r26) + + ADDIB> -1, %r1, 1b + ldo 128(%r26), %r26 + +#else /* * This loop is optimized for PCXL/PCXL2 ldw/ldw and stw/stw - * bundles (very restricted rules for bundling). It probably - * does OK on PCXU and better, but we could do better with - * ldd/std instructions. 
Note that until (if) we start saving + * bundles (very restricted rules for bundling). + * Note that until (if) we start saving * the full 64 bit register values on interrupt, we can't * use ldd/std on a 32 bit kernel. */ - + ldi 64, %r1 /* PAGE_SIZE/64 == 64 */ 1: ldw 0(%r25), %r19 @@ -343,7 +398,7 @@ copy_user_page_asm: ldo 64(%r26), %r26 ADDIB> -1, %r1, 1b ldo 64(%r25), %r25 - +#endif bv %r0(%r2) nop .exit