author		Jake Burkholder <jake@FreeBSD.org>	2001-09-30 19:50:39 +0000
committer	Jake Burkholder <jake@FreeBSD.org>	2001-09-30 19:50:39 +0000
commit		fa753b0bcb1bb13db96c56cd44e61874b87fe7b8 (patch)
tree		40bc310af8ed7c3091d02056f1bc477dac4f4bbd /sys
parent		c4bc2cc71445a9f8a7c21aa1dfd173d917bb5bd1 (diff)
Optimize bcopy, bzero, etc. to use 64-bit loads and stores where possible.
Handle overlap in bcopy.  Add routines for copying and zeroing pages using
physical addresses directly.  Remove all the hacks to account for calling
the firmware on its own trap table; we use the kernel trap table.  There is
still a problem with OF_exit().
Notes:
    svn path=/head/; revision=84193
Diffstat (limited to 'sys')
-rw-r--r--	sys/sparc64/sparc64/support.S	353
1 file changed, 237 insertions, 116 deletions
diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S
index 41469f51a791..bbd2d132bb67 100644
--- a/sys/sparc64/sparc64/support.S
+++ b/sys/sparc64/sparc64/support.S
@@ -33,62 +33,184 @@
#include "assym.s"
-#define E
+#define E /* empty */
+/*
+ * Generate load and store instructions for the corresponding width and asi
+ * (or not). Note that we want to evaluate the macro args before
+ * concatenating, so that E really turns into nothing.
+ */
#define _LD(w, a) ld ## w ## a
#define _ST(w, a) st ## w ## a
#define LD(w, a) _LD(w, a)
#define ST(w, a) _ST(w, a)
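
The two-level expansion above is the standard preprocessor idiom for pasting
macro arguments only after they have been expanded, which is what lets E
really vanish. A self-contained C illustration of the same pattern (the
load_* names are invented for the demo):

    #include <stdio.h>

    #define E                       /* expands to nothing */
    #define _NAME(w, a)     load_ ## w ## a
    #define NAME(w, a)      _NAME(w, a)     /* expand args, then paste */

    static void load_ub(void)  { puts("ldub");  }
    static void load_uba(void) { puts("lduba"); }

    int
    main(void)
    {
        NAME(ub, E)();  /* E expands away: load_ub(), as LD(ub, E) -> ldub */
        NAME(ub, a)();  /* load_uba(), as LD(ub, a) -> lduba */
        /* Pasting directly, _NAME(ub, E)() would give the undefined load_ubE. */
        return (0);
    }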
-#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \
- brz,pn len, 2f ; \
- mov len, %o3 ; \
-1: LD(ub, sa) [src] sasi, %o4 ; \
- ST(b, da) %o4, [dst] dasi ; \
- dec %o3 ; \
- inc src ; \
- brnz,pt %o3, 1b ; \
- inc dst ; \
-2:
-
-#define BCOPY(src, dst, len) \
- _BCOPY(src, dst, len, E, E, E, E)
-
-#define COPYIN(uaddr, kaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(uaddr, kaddr, len, a, %asi, E, E)
-
-#define COPYOUT(kaddr, uaddr, len) \
- wr %g0, ASI_AIUP, %asi ; \
- _BCOPY(kaddr, uaddr, len, E, E, a, %asi)
+/*
+ * Common code for copy routines.
+ *
+ * We use large macros to generate functions for each of the copy routines.
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not. It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty). It is up to the caller to set %asi.
+ */
+/*
+ * ASI independent implementation of copystr(9).
+ * Used to implement copyinstr() and copystr().
+ *
+ * Return value is in %g1.
+ */
#define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
- clr %o4 ; \
- clr %o5 ; \
-1: LD(ub, sa) [src] sasi, %g1 ; \
+ brz len, 4f ; \
+ mov src, %g2 ; \
+1: deccc 1, len ; \
+ bl,a,pn %xcc, 2f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
- brz,pn %g1, 2f ; \
- inc %o4 ; \
- dec len ; \
- inc src ; \
- brgz,pt len, 1b ; \
+ brz,pn %g1, 3f ; \
+ inc src ; \
+ b %xcc, 1b ; \
inc dst ; \
- mov ENAMETOOLONG, %o5 ; \
-2: brnz,a done, 3f ; \
- stx %o4, [done] ; \
-3:
+2: mov ENAMETOOLONG, %g1 ; \
+3: sub src, %g2, %g2 ; \
+ brnz,a done, 4f ; \
+ stx %g2, [done] ; \
+4:
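
For reference, a hedged C sketch of the copystr(9) contract this macro
implements (illustrative only, not the kernel source): copy at most len
bytes including the terminating NUL, return ENAMETOOLONG on truncation, and
report the bytes consumed through done when it is non-NULL.

    #include <errno.h>              /* ENAMETOOLONG */
    #include <stddef.h>

    int
    sketch_copystr(const char *src, char *dst, size_t len, size_t *done)
    {
        const char *start = src;
        int error = 0;

        for (;;) {
            if (len-- == 0) {       /* ran out of room first */
                error = ENAMETOOLONG;
                break;
            }
            if ((*dst++ = *src++) == '\0')
                break;
        }
        if (done != NULL)
            *done = (size_t)(src - start);  /* includes the NUL */
        return (error);
    }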
-#define COPYSTR(dst, src, len, done) \
- _COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * If the pattern is non-zero, duplicate it to fill 64 bits.
+ * Store bytes until dst is 8-byte aligned, then store 8 bytes.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ *
+ * XXX bzero() should be implemented as
+ * #define bzero(dst, len) (void)memset((dst), 0, (len))
+ * if at all.
+ */
+#define _MEMSET(dst, pat, len, da, dasi) \
+ brlez,pn len, 5f ; \
+ and pat, 0xff, pat ; \
+ brz,pt pat, 1f ; \
+ sllx pat, 8, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 16, %g1 ; \
+ or pat, %g1, pat ; \
+ sllx pat, 32, %g1 ; \
+ or pat, %g1, pat ; \
+ .align 16 ; \
+1: deccc 1, len ; \
+ bl,pn %xcc, 5f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 1b ; \
+ inc dst ; \
+ .align 16 ; \
+2: deccc 32, len ; \
+ bl,a,pn %xcc, 3f ; \
+ inc 32, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ ST(x, da) pat, [dst + 8] dasi ; \
+ ST(x, da) pat, [dst + 16] dasi ; \
+ ST(x, da) pat, [dst + 24] dasi ; \
+ b %xcc, 2b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+3: deccc 8, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 8, len ; \
+ ST(x, da) pat, [dst] dasi ; \
+ b %xcc, 3b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+4: deccc 1, len ; \
+ bl,a,pn %xcc, 5f ; \
+ nop ; \
+ ST(b, da) pat, [dst] dasi ; \
+ b %xcc, 4b ; \
+ inc 1, dst ; \
+5:
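
The pattern-replication prologue corresponds to a well-known C idiom; a
minimal sketch, assuming only the low byte of the memset() argument is
significant (as the "and pat, 0xff" above enforces):

    #include <stdint.h>

    static inline uint64_t
    sketch_fill64(int c)
    {
        uint64_t pat = (uint64_t)c & 0xff;

        pat |= pat << 8;        /* 1 byte  -> 2 bytes  */
        pat |= pat << 16;       /* 2 bytes -> 4 bytes  */
        pat |= pat << 32;       /* 4 bytes -> 8 bytes  */
        return (pat);
    }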
-#define COPYINSTR(uaddr, kaddr, len, done) \
- wr %g0, ASI_AIUP, %asi ; \
- _COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned. If src is then also 8-byte
+ * aligned, transfer 8 bytes; otherwise finish with bytes. The unaligned
+ * case could be optimized, but it is expected that this is the uncommon
+ * case and of questionable value. The code to do so is also rather large
+ * and ugly.
+ * It has yet to be determined how much unrolling is beneficial.
+ *
+ * XXX bcopy() must also check for overlap. This is stupid.
+ * XXX bcopy() should be implemented as
+ * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ * if at all.
+ */
+#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \
+1: deccc 1, len ; \
+ bl,pn %xcc, 6f ; \
+ btst 7, dst ; \
+ bz,a,pt %xcc, 2f ; \
+ inc 1, len ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc 1, src ; \
+ b %xcc, 1b ; \
+ inc 1, dst ; \
+ .align 16 ; \
+2: btst 7, src ; \
+ bz,a,pt %xcc, 3f ; \
+ nop ; \
+ b,a %xcc, 5f ; \
+ .align 16 ; \
+3: deccc 32, len ; \
+ bl,a,pn %xcc, 4f ; \
+ inc 32, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ LD(x, sa) [src + 8] sasi, %g2 ; \
+ LD(x, sa) [src + 16] sasi, %g3 ; \
+ LD(x, sa) [src + 24] sasi, %g4 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ ST(x, da) %g2, [dst + 8] dasi ; \
+ ST(x, da) %g3, [dst + 16] dasi ; \
+ ST(x, da) %g4, [dst + 24] dasi ; \
+ inc 32, src ; \
+ b %xcc, 3b ; \
+ inc 32, dst ; \
+ .align 16 ; \
+4: deccc 8, len ; \
+ bl,a,pn %xcc, 5f ; \
+ inc 8, len ; \
+ LD(x, sa) [src] sasi, %g1 ; \
+ ST(x, da) %g1, [dst] dasi ; \
+ inc 8, src ; \
+ b %xcc, 4b ; \
+ inc 8, dst ; \
+ .align 16 ; \
+5: deccc 1, len ; \
+ bl,a,pn %xcc, 6f ; \
+ nop ; \
+ LD(ub, sa) [src] sasi, %g1 ; \
+ ST(b, da) %g1, [dst] dasi ; \
+ inc src ; \
+ b %xcc, 5b ; \
+ inc dst ; \
+6:
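
An approximate C rendering of the forward-copy strategy (a sketch under the
comment's assumptions, not the kernel code; the asm keeps everything in
%g1-%g4 and honors ASIs instead):

    #include <stddef.h>
    #include <stdint.h>

    void
    sketch_memcpy_fwd(void *dst, const void *src, size_t len)
    {
        const unsigned char *s = src;
        unsigned char *d = dst;

        /* Byte copies until the destination is 8-byte aligned. */
        for (; len > 0 && ((uintptr_t)d & 7) != 0; len--)
            *d++ = *s++;
        /* If the source now shares the alignment, move 32/8-byte blocks. */
        if (((uintptr_t)s & 7) == 0) {
            const uint64_t *s8 = (const uint64_t *)(const void *)s;
            uint64_t *d8 = (uint64_t *)(void *)d;

            for (; len >= 32; len -= 32, s8 += 4, d8 += 4) {
                uint64_t g1 = s8[0], g2 = s8[1];
                uint64_t g3 = s8[2], g4 = s8[3];

                d8[0] = g1; d8[1] = g2; d8[2] = g3; d8[3] = g4;
            }
            for (; len >= 8; len -= 8)
                *d8++ = *s8++;
            s = (const unsigned char *)s8;
            d = (unsigned char *)d8;
        }
        /* Unaligned source, or the tail: finish with bytes. */
        for (; len > 0; len--)
            *d++ = *s++;
    }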
#define CATCH_SETUP(label) \
setx label, %g2, %g1 ; \
- ldx [PCPU(CURPCB)], %g6 ; \
+ ldx [PCPU(CURTHREAD)], %g6 ; \
+ ldx [%g6 + TD_PCB], %g6 ; \
stx %g1, [%g6 + PCB_ONFAULT] ;
#define CATCH_END() \
@@ -119,7 +241,7 @@
SU_ALIGNED(storer, label)
/*
- * void bcmp(void *b, size_t len)
+ * int bcmp(const void *b1, const void *b2, size_t len)
*/
ENTRY(bcmp)
brz,pn %o2, 2f
@@ -127,7 +249,7 @@ ENTRY(bcmp)
1: ldub [%o0 + %o3], %o4
ldub [%o1 + %o3], %o5
cmp %o4, %o5
- bne,pn %xcc, 1f
+ bne,pn %xcc, 2f
inc %o3
deccc %o2
bne,pt %xcc, 1b
@@ -139,46 +261,90 @@ END(bcmp)
/*
* void bcopy(const void *src, void *dst, size_t len)
*/
+ENTRY(ovbcopy)
ENTRY(bcopy)
- BCOPY(%o0, %o1, %o2)
+ /*
+ * Check for overlap, and copy backwards if so.
+ */
+ sub %o1, %o0, %g1
+ cmp %g1, %o2
+ bgeu,a,pt %xcc, 3f
+ nop
+
+ /*
+ * Copy backwards.
+ */
+ add %o0, %o2, %o0
+ add %o1, %o2, %o1
+1: deccc 1, %o2
+ bl,a,pn %xcc, 2f
+ nop
+ dec 1, %o0
+ ldub [%o0], %g1
+ dec 1, %o1
+ b %xcc, 1b
+ stb %g1, [%o1]
+2: retl
+ nop
+
+ /*
+ * Do the fast version.
+ */
+3: _MEMCPY(%o1, %o0, %o2, E, E, E, E)
retl
nop
END(bcopy)
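
The overlap test at the top of bcopy() exploits unsigned wraparound so that
a single compare catches only the harmful case where dst starts inside src;
in illustrative C:

    #include <stddef.h>
    #include <stdint.h>

    void
    sketch_bcopy(const void *src, void *dst, size_t len)
    {
        const unsigned char *s = src;
        unsigned char *d = dst;

        /*
         * dst - src wraps to a huge value when dst < src, so this one
         * unsigned compare is false for all non-overlapping cases.
         */
        if ((uintptr_t)d - (uintptr_t)s < (uintptr_t)len) {
            s += len;               /* overlap: copy high to low */
            d += len;
            while (len-- > 0)
                *--d = *--s;
        } else {
            while (len-- > 0)       /* fast forward path */
                *d++ = *s++;
        }
    }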
/*
- * void ovbcopy(const void *src, void *dst, size_t len)
- * XXX handle overlap...
+ * void bzero(void *b, size_t len)
*/
-ENTRY(ovbcopy)
- BCOPY(%o0, %o1, %o2)
+ENTRY(bzero)
+ _MEMSET(%o0, %g0, %o1, E, E)
retl
nop
-END(ovbcopy)
+END(bzero)
/*
- * void bzero(void *b, size_t len)
+ * void physzero(vm_offset_t pa, size_t len)
*/
-ENTRY(bzero)
- brz,pn %o1, 1f
+ENTRY(physzero)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMSET(%o0, %g0, %o1, a, %asi)
+ retl
nop
-1: deccc %o1
- stb %g0, [%o0]
- bne,pt %xcc, 1b
- inc %o0
-2: retl
+END(physzero)
+
+/*
+ * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
+ */
+ENTRY(physcopy)
+ wr %g0, ASI_PHYS_USE_EC, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi)
+ retl
nop
-END(bzero)
+END(physcopy)
/*
* void *memcpy(void *dst, const void *src, size_t len)
*/
ENTRY(memcpy)
- BCOPY(%o1, %o0, %o2)
+ mov %o0, %o3
+ _MEMCPY(%o3, %o1, %o2, E, E, E, E)
retl
nop
END(memcpy)
/*
+ * void *memset(void *b, int c, size_t len)
+ */
+ENTRY(memset)
+ mov %o0, %o3
+ _MEMSET(%o3, %o1, %o2, E, E)
+ retl
+ nop
+END(memset)
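
memcpy(3) and memset(3) must return their first argument, which is why both
entry points copy %o0 to %o3 and let the macro consume the scratch copy.
Standard C usage that relies on this:

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char buf[16];

        /* Both calls return buf, so they can be chained. */
        printf("%s\n", (char *)memcpy(memset(buf, 0, sizeof(buf)),
            "chained", 8));
        return (0);
    }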
+
+/*
* int copyin(const void *uaddr, void *kaddr, size_t len)
*/
ENTRY(copyin)
@@ -191,7 +357,8 @@ ENTRY(copyin)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYIN(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
CATCH_END()
retl
clr %o0
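
CATCH_SETUP()/CATCH_END() bracket the user access with the usual kernel
on-fault protocol: a fault on the user address makes the trap handler resume
at the label, which returns an error instead of panicking. A userland
analogue built on setjmp(3) (purely illustrative; the kernel uses
pcb_onfault and the trap table, not setjmp):

    #include <errno.h>
    #include <setjmp.h>
    #include <stddef.h>

    static jmp_buf onfault;         /* stands in for pcb_onfault */

    /* A fault handler would longjmp(onfault, 1) on a bad user access. */
    int
    sketch_copyin(const void *uaddr, void *kaddr, size_t len)
    {
        (void)uaddr; (void)kaddr; (void)len;
        if (setjmp(onfault) != 0)
            return (EFAULT);        /* fault caught: error return */
        /* ... transfer len bytes from uaddr to kaddr ... */
        return (0);
    }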
@@ -211,10 +378,11 @@ ENTRY(copyinstr)
stx %o3, [%g1 + KTR_PARM4]
9:
#endif
- COPYINSTR(%o0, %o1, %o2, %o3)
+ wr %g0, ASI_AIUP, %asi
+ _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
CATCH_END()
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copyinstr)
/*
@@ -230,7 +398,8 @@ ENTRY(copyout)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
- COPYOUT(%o0, %o1, %o2)
+ wr %g0, ASI_AIUP, %asi
+ _MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
CATCH_END()
retl
clr %o0
@@ -250,9 +419,9 @@ END(copyout)
* int copystr(const void *src, void *dst, size_t len, size_t *done)
*/
ENTRY(copystr)
- COPYSTR(%o0, %o1, %o2, %o3)
+ _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)
retl
- mov %o5, %o0
+ mov %g1, %o0
END(copystr)
/*
@@ -325,7 +494,6 @@ ENTRY(fsbail)
.Lfsalign:
retl
mov -1, %o0
-END(fsbail)
ENTRY(longjmp)
set 1, %g3
@@ -355,64 +523,17 @@ ENTRY(setjmp)
END(setjmp)
/*
- * Temporary stack for calling into the firmware. We need to setup one, because
- * the MMU mapping for our stack page may be lost. When the firmware tries to
- * spill the last window (the others are flushed before), this results in an
- * DMMU miss trap, which is fatal with the firmware trap handlers installed.
- * Additionally, it seems that the firmware does not immediately switch to an
- * own stack (or maybe never?), therefore more space needs to be reserved.
- * I hope this is sufficient now.
- */
- .align 4
-DATA(ofwstack)
- .rept CCFSZ * 8
- .byte 0
- .endr
-ofwstack_last:
- .rept CCFSZ
- .byte 0
- .endr
-END(ofwstack)
-
-/*
* void openfirmware(cell_t args[])
*/
ENTRY(openfirmware)
- /*
- * Disable interrupts. The firmware should not deal with our interrupts
- * anyway, and the temporary stack is not large enough to hold the stack
- * footprint of the interrrupt handling.
- */
- rdpr %pstate, %o3
- andn %o3, PSTATE_IE, %o1
- wrpr %o1, 0, %pstate
- setx ofwstack_last - SPOFF, %o1, %o2
- save %o2, 0, %sp
- flushw
- rdpr %tl, %l1
- rdpr %tba, %l2
- mov AA_DMMU_PCXR, %l3
- ldxa [%l3] ASI_DMMU, %l4
- stxa %g0, [%l3] ASI_DMMU
- membar #Sync
- flush %sp
- setx ofw_tba, %l7, %l5
- ldx [%l5], %l5
+ save %sp, -CCFSZ, %sp
setx ofw_vec, %l7, %l6
ldx [%l6], %l6
rdpr %pil, %l7
- wrpr %g0, 14, %pil
- wrpr %l5, 0, %tba
- wrpr %g0, 0, %tl
+ wrpr %g0, PIL_TICK, %pil
call %l6
mov %i0, %o0
- wrpr %l1, 0, %tl
- wrpr %l2, 0, %tba
- stxa %l4, [%l3] ASI_DMMU
wrpr %l7, 0, %pil
- membar #Sync
- flush %sp
- restore
- retl
- wrpr %o3, 0, %pstate
+ ret
+ restore %o0, %g0, %o0
END(openfirmware)
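
openfirmware() is now a plain call through ofw_vec with the PIL raised,
since the firmware runs on the kernel trap table. Its argument follows the
IEEE 1275 client-interface convention of an array of cells; a hedged sketch
of a caller (the cell_t layout, the prototype, and the use of the exit
service are assumptions, chosen to echo the OF_exit() caveat in the commit
message):

    #include <stdint.h>

    typedef uint64_t cell_t;        /* assumed cell width on sparc64 */
    uint64_t openfirmware(void *);  /* prototype assumed for the sketch */

    void
    sketch_OF_exit(void)
    {
        static struct {
            cell_t name;            /* pointer to the service name */
            cell_t nargs;           /* argument cells that follow */
            cell_t nreturns;        /* return cells that follow */
        } args;

        args.name = (cell_t)(uintptr_t)"exit";
        args.nargs = 0;
        args.nreturns = 0;
        (void)openfirmware(&args);  /* should not return */
    }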