1 /* memcpy.S: Sparc optimized memcpy and memmove code
2 * Hand optimized from GNU libc's memcpy and memmove
3 * Copyright (C) 1991,1996 Free Software Foundation
4 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
5 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
6 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
7 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
19 #undef FASTER_NONALIGNED
20 #define FASTER_ALIGNED
22 /* In kernel these functions don't return a value.
23 * One should use macros in asm/string.h for that purpose.
24 * We return 0, so that bugs are more apparent.
27 #define RETL_INSN clr %o0
35 #define FASTER_REVERSE
36 #define FASTER_NONALIGNED
37 #define FASTER_ALIGNED
39 #define SETUP_RETL mov %o0, %g6
40 #define RETL_INSN mov %g6, %o0
44 /* Both these macros have to start with exactly the same insn */
/* MOVE_BIGCHUNK: copy 32 bytes from [%src + offset] to [%dst + offset].
 * Four ldd loads fetch the data into the register pairs t0/t1, t2/t3,
 * t4/t5 and t6/t7 (ldd fills an even/odd register pair per the SPARC
 * architecture); eight word-sized st stores then write the registers
 * out in ascending address order, so the destination is only ever
 * accessed one word at a time.
 * src, dst and t0..t7 are register names given without the leading '%';
 * offset is a byte displacement (the visible call sites pass constants).
 * All eight temporaries are overwritten.
 * The first instruction must stay identical to MOVE_BIGALIGNCHUNK's.
 */
45 #define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
46 ldd [%src + (offset) + 0x00], %t0; \
47 ldd [%src + (offset) + 0x08], %t2; \
48 ldd [%src + (offset) + 0x10], %t4; \
49 ldd [%src + (offset) + 0x18], %t6; \
50 st %t0, [%dst + (offset) + 0x00]; \
51 st %t1, [%dst + (offset) + 0x04]; \
52 st %t2, [%dst + (offset) + 0x08]; \
53 st %t3, [%dst + (offset) + 0x0c]; \
54 st %t4, [%dst + (offset) + 0x10]; \
55 st %t5, [%dst + (offset) + 0x14]; \
56 st %t6, [%dst + (offset) + 0x18]; \
57 st %t7, [%dst + (offset) + 0x1c];
/* MOVE_BIGALIGNCHUNK: copy 32 bytes from [%src + offset] to
 * [%dst + offset] using doubleword accesses on both sides: four ldd
 * loads into the pairs t0/t1, t2/t3, t4/t5, t6/t7 followed by four std
 * stores of the same pairs.
 * Same parameters as MOVE_BIGCHUNK; t1, t3, t5 and t7 are not named in
 * the body but are still written/read implicitly as the odd half of
 * each ldd/std register pair.
 * NOTE(review): std needs a doubleword-aligned destination per the
 * SPARC architecture -- callers are presumably responsible for that;
 * confirm against the call sites.
 * The first instruction must stay identical to MOVE_BIGCHUNK's.
 */
59 #define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
60 ldd [%src + (offset) + 0x00], %t0; \
61 ldd [%src + (offset) + 0x08], %t2; \
62 ldd [%src + (offset) + 0x10], %t4; \
63 ldd [%src + (offset) + 0x18], %t6; \
64 std %t0, [%dst + (offset) + 0x00]; \
65 std %t2, [%dst + (offset) + 0x08]; \
66 std %t4, [%dst + (offset) + 0x10]; \
67 std %t6, [%dst + (offset) + 0x18];
/* MOVE_LASTCHUNK: copy the 16 bytes that end at (%src - offset) to the
 * 16 bytes that end at (%dst - offset), i.e. addresses
 * [ptr - offset - 0x10, ptr - offset).  Two ldd loads fill the pairs
 * t0/t1 and t2/t3; four word-sized st stores write them out.
 * src, dst and t0..t3 are register names without the leading '%'.
 * This macro is expanded seven times back-to-back (offsets 0x60 down to
 * 0x00) in the table between labels 79 and 80, which is reached through
 * "jmpl %o5 + %lo(80f)".
 * NOTE(review): that computed jump presumably lands part-way into the
 * table, so the instruction count of this macro must not change unless
 * the offset computation (not visible here) is updated as well.
 */
69 #define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
70 ldd [%src - (offset) - 0x10], %t0; \
71 ldd [%src - (offset) - 0x08], %t2; \
72 st %t0, [%dst - (offset) - 0x10]; \
73 st %t1, [%dst - (offset) - 0x0c]; \
74 st %t2, [%dst - (offset) - 0x08]; \
75 st %t3, [%dst - (offset) - 0x04];
/* MOVE_LASTALIGNCHUNK: doubleword-store variant of MOVE_LASTCHUNK.
 * Copies the 16 bytes ending at (%src - offset) to the 16 bytes ending
 * at (%dst - offset) with two ldd loads and two std stores.
 * t1 and t3 are not named in the body but take part implicitly as the
 * odd registers of the t0/t1 and t2/t3 pairs.
 * Expanded seven times (offsets 0x60 down to 0x00) in the table between
 * labels 83 and 84, which is reached through "jmpl %o5 + %lo(84f)".
 * NOTE(review): the computed jump presumably depends on this macro
 * being exactly four instructions long -- verify before changing it.
 */
77 #define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
78 ldd [%src - (offset) - 0x10], %t0; \
79 ldd [%src - (offset) - 0x08], %t2; \
80 std %t0, [%dst - (offset) - 0x10]; \
81 std %t2, [%dst - (offset) - 0x08];
/* MOVE_SHORTCHUNK: copy the two bytes at (%src - offset - 2) and
 * (%src - offset - 1) to the same displacements from %dst, using byte
 * loads (ldub) into t0/t1 and byte stores (stb).
 * Both loads are issued before either store.
 * Expanded seven times (offsets 0x0c down to 0x00) immediately before
 * label 89, which is reached through "jmpl %o5 + %lo(89f)".
 * NOTE(review): the computed jump presumably relies on every expansion
 * being exactly four instructions -- verify before changing it.
 */
83 #define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
84 ldub [%src - (offset) - 0x02], %t0; \
85 ldub [%src - (offset) - 0x01], %t1; \
86 stb %t0, [%dst - (offset) - 0x02]; \
87 stb %t1, [%dst - (offset) - 0x01];
89 /* Both these macros have to start with exactly the same insn */
/* RMOVE_BIGCHUNK: counterpart of MOVE_BIGCHUNK that addresses the data
 * with negative displacements.  Copies the 32 bytes ending at
 * (%src - offset), i.e. [%src - offset - 0x20, %src - offset), to the
 * matching range below %dst.
 * Four ldd loads fill the pairs t0/t1, t2/t3, t4/t5, t6/t7; eight
 * word-sized st stores write them out in ascending address order.
 * src, dst and t0..t7 are register names without the leading '%'.
 * All eight temporaries are overwritten.
 * The first instruction must stay identical to RMOVE_BIGALIGNCHUNK's.
 */
90 #define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
91 ldd [%src - (offset) - 0x20], %t0; \
92 ldd [%src - (offset) - 0x18], %t2; \
93 ldd [%src - (offset) - 0x10], %t4; \
94 ldd [%src - (offset) - 0x08], %t6; \
95 st %t0, [%dst - (offset) - 0x20]; \
96 st %t1, [%dst - (offset) - 0x1c]; \
97 st %t2, [%dst - (offset) - 0x18]; \
98 st %t3, [%dst - (offset) - 0x14]; \
99 st %t4, [%dst - (offset) - 0x10]; \
100 st %t5, [%dst - (offset) - 0x0c]; \
101 st %t6, [%dst - (offset) - 0x08]; \
102 st %t7, [%dst - (offset) - 0x04];
/* RMOVE_BIGALIGNCHUNK: doubleword-store variant of RMOVE_BIGCHUNK.
 * Copies the 32 bytes ending at (%src - offset) to the 32 bytes ending
 * at (%dst - offset) with four ldd loads followed by four std stores of
 * the pairs t0/t1, t2/t3, t4/t5, t6/t7.
 * t1, t3, t5 and t7 are not named in the body but are used implicitly
 * as the odd half of each register pair.
 * NOTE(review): std needs a doubleword-aligned destination per the
 * SPARC architecture -- presumably guaranteed by the caller; confirm.
 * The first instruction must stay identical to RMOVE_BIGCHUNK's.
 */
104 #define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
105 ldd [%src - (offset) - 0x20], %t0; \
106 ldd [%src - (offset) - 0x18], %t2; \
107 ldd [%src - (offset) - 0x10], %t4; \
108 ldd [%src - (offset) - 0x08], %t6; \
109 std %t0, [%dst - (offset) - 0x20]; \
110 std %t2, [%dst - (offset) - 0x18]; \
111 std %t4, [%dst - (offset) - 0x10]; \
112 std %t6, [%dst - (offset) - 0x08];
/* RMOVE_LASTCHUNK: copy the 16 bytes starting at (%src + offset) to the
 * 16 bytes starting at (%dst + offset).  Two ldd loads fill the pairs
 * t0/t1 and t2/t3; four word-sized st stores write them out.
 * src, dst and t0..t3 are register names without the leading '%'.
 * Expanded seven times (offsets 0x60 down to 0x00) in the table between
 * labels 71 and 72, which is reached through "jmpl %o5 + %lo(72f)" and
 * "jmpl %o5 + %lo(72b)".
 * NOTE(review): those computed jumps presumably land part-way into the
 * table, so the instruction count of this macro must not change unless
 * the offset computation (not visible here) is updated as well.
 */
114 #define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
115 ldd [%src + (offset) + 0x00], %t0; \
116 ldd [%src + (offset) + 0x08], %t2; \
117 st %t0, [%dst + (offset) + 0x00]; \
118 st %t1, [%dst + (offset) + 0x04]; \
119 st %t2, [%dst + (offset) + 0x08]; \
120 st %t3, [%dst + (offset) + 0x0c];
/* RMOVE_SHORTCHUNK: copy the two bytes at (%src + offset) and
 * (%src + offset + 1) to the same displacements from %dst, using byte
 * loads (ldub) into t0/t1 and byte stores (stb).
 * Both loads are issued before either store.
 * Expanded seven times (offsets 0x0c down to 0x00) immediately before
 * label 76, which is reached through "jmpl %o5 + %lo(76f)".
 * NOTE(review): the computed jump presumably relies on every expansion
 * being exactly four instructions -- verify before changing it.
 */
122 #define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
123 ldub [%src + (offset) + 0x00], %t0; \
124 ldub [%src + (offset) + 0x01], %t1; \
125 stb %t0, [%dst + (offset) + 0x00]; \
126 stb %t1, [%dst + (offset) + 0x01];
128 #define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
129 ldd [%src + (offset) + 0x00], %t0; \
130 ldd [%src + (offset) + 0x08], %t2; \
131 srl %t0, shir, %t5; \
132 srl %t1, shir, %t6; \
133 sll %t0, shil, %t0; \
134 or %t5, %prev, %t5; \
135 sll %t1, shil, %prev; \
137 srl %t2, shir, %t1; \
138 srl %t3, shir, %t6; \
139 sll %t2, shil, %t2; \
140 or %t1, %prev, %t1; \
141 std %t4, [%dst + (offset) + (offset2) - 0x04]; \
142 std %t0, [%dst + (offset) + (offset2) + 0x04]; \
143 sll %t3, shil, %prev; \
146 #define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
147 ldd [%src + (offset) + 0x00], %t0; \
148 ldd [%src + (offset) + 0x08], %t2; \
149 srl %t0, shir, %t4; \
150 srl %t1, shir, %t5; \
151 sll %t0, shil, %t6; \
152 or %t4, %prev, %t0; \
153 sll %t1, shil, %prev; \
155 srl %t2, shir, %t4; \
156 srl %t3, shir, %t5; \
157 sll %t2, shil, %t6; \
158 or %t4, %prev, %t2; \
159 sll %t3, shil, %prev; \
161 std %t0, [%dst + (offset) + (offset2) + 0x00]; \
162 std %t2, [%dst + (offset) + (offset2) + 0x08];
167 #ifdef FASTER_REVERSE
169 70: /* rdword_align */
189 #endif /* FASTER_REVERSE */
193 nop ! Only bcopy returns here and it returns void...
210 #ifndef FASTER_REVERSE
217 1: /* reverse_bytes */
229 #else /* FASTER_REVERSE */
250 andcc %g1, 0xffffff80, %g7
256 RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
257 RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
258 RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
259 RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
274 jmpl %o5 + %lo(72f), %g0
277 71: /* rmemcpy_table */
278 RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
279 RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
280 RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
281 RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
282 RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
283 RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
284 RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
286 72: /* rmemcpy_table_end */
291 ldd [%o1 - 0x08], %g2
297 73: /* rmemcpy_last7 */
325 RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
326 RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
327 RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
328 RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
343 jmpl %o5 + %lo(72b), %g0
355 jmpl %o5 + %lo(76f), %g0
358 RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
359 RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
360 RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
361 RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
362 RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
363 RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
364 RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
366 76: /* rshort_table_end */
376 91: /* rshort_aligned_end */
394 77: /* rnon_aligned */
515 #endif /* FASTER_REVERSE */
517 /* NOTE: This code is executed just for the cases,
518 where %src (=%o1) & 3 is != 0.
519 We need to align it to 4. So, for (%src & 3)
520 1 we need to do ldub,lduh
523 so even if it looks weird, the branches
524 are correct here. -jj
526 78: /* dword_align */
549 FUNC(memcpy) /* %o0=dst %o1=src %o2=len */
575 andcc %g1, 0xffffff80, %g7
581 MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
582 MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
583 MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
584 MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
599 jmpl %o5 + %lo(80f), %g0
602 79: /* memcpy_table */
604 MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
605 MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
606 MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
607 MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
608 MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
609 MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
610 MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
612 80: /* memcpy_table_end */
622 81: /* memcpy_last7 */
650 MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
651 MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
652 MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
653 MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
659 #ifndef FASTER_ALIGNED
670 jmpl %o5 + %lo(80b), %g0
673 #else /* FASTER_ALIGNED */
682 jmpl %o5 + %lo(84f), %g0
685 83: /* amemcpy_table */
687 MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
688 MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
689 MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
690 MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
691 MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
692 MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
693 MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
695 84: /* amemcpy_table_end */
701 std %g2, [%o0 - 0x08]
703 85: /* amemcpy_last7 */
729 #endif /* FASTER_ALIGNED */
731 86: /* non_aligned */
735 #ifdef FASTER_NONALIGNED
740 #endif /* FASTER_NONALIGNED */
861 #ifdef FASTER_NONALIGNED
863 87: /* faster_nonaligned */
918 and %o2, 0xffffffc0, %o3
921 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
922 SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
923 SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
924 SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
934 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
957 and %o2, 0xffffffc0, %o3
960 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
961 SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
962 SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
963 SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
973 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
996 and %o2, 0xffffffc0, %o3
1001 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
1002 SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
1003 SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
1004 SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
1010 andcc %o2, 0x30, %o3
1014 SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
1026 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
1027 SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
1028 SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
1029 SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
1035 andcc %o2, 0x30, %o3
1039 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
1052 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
1053 SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
1054 SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
1055 SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
1061 andcc %o2, 0x30, %o3
1065 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
1077 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
1078 SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
1079 SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
1080 SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
1086 andcc %o2, 0x30, %o3
1090 SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
1102 #endif /* FASTER_NONALIGNED */
1113 jmpl %o5 + %lo(89f), %g0
1116 MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
1117 MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
1118 MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
1119 MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
1120 MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
1121 MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
1122 MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
1124 89: /* short_table_end */
1135 90: /* short_aligned_end */
1142 ld [%o1 + 0x00], %g2
1143 ld [%o1 + 0x04], %g3
1145 st %g2, [%o0 + 0x00]
1146 st %g3, [%o0 + 0x04]