Merge by Hand
[linux-2.6] / arch / cris / arch-v32 / lib / memset.c
1 /*#************************************************************************#*/
2 /*#-------------------------------------------------------------------------*/
3 /*#                                                                         */
4 /*# FUNCTION NAME: memset()                                                 */
5 /*#                                                                         */
6 /*# PARAMETERS:  void* dst;   Destination address.                          */
7 /*#              int     c;   Value of byte to write.                       */
8 /*#              int   len;   Number of bytes to write.                     */
9 /*#                                                                         */
10 /*# RETURNS:     dst.                                                       */
11 /*#                                                                         */
12 /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
13 /*#              Framework taken from memcpy.  This routine is              */
14 /*#              very sensitive to compiler changes in register allocation. */
15 /*#              Should really be rewritten to avoid this problem.          */
16 /*#                                                                         */
17 /*#-------------------------------------------------------------------------*/
18 /*#                                                                         */
19 /*# HISTORY                                                                 */
20 /*#                                                                         */
21 /*# DATE      NAME            CHANGES                                       */
22 /*# ----      ----            -------                                       */
23 /*# 990713    HP              Tired of watching this function (or           */
24 /*#                           really, the nonoptimized generic              */
25 /*#                           implementation) take up 90% of simulator      */
26 /*#                           output.  Measurements needed.                 */
27 /*#                                                                         */
28 /*#-------------------------------------------------------------------------*/
29
30 #include <linux/types.h>
31
32 /* No, there's no macro saying 12*4, since it is "hard" to get it into
33    the asm in a good way.  Thus better to expose the problem everywhere.
34    */
35
36 /* Assuming 1 cycle per dword written or read (ok, not really true), and
37    one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
38    so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
39
/* Number of bytes handled per 'movem' iteration in memset(): 12 dwords of
   4 bytes each.  memset() only takes the 'movem' path when at least this
   many bytes remain. */
40 #define ZERO_BLOCK_SIZE (1*12*4)
41
42 void *memset(void *pdst,
43              int c,
44              size_t plen)
45 {
46   /* Ok.  Now we want the parameters put in special registers.
47      Make sure the compiler is able to make something useful of this. */
48
49   register char *return_dst __asm__ ("r10") = pdst;
50   register int n __asm__ ("r12") = plen;
51   register int lc __asm__ ("r11") = c;
52
53   /* Most apps use memset sanely.  Only those memsetting about 3..4
54      bytes or less get penalized compared to the generic implementation
55      - and that's not really sane use. */
56
57   /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
58      they compile cascaded "x |= x << 8" sanely! */
59   __asm__("movu.b %0,$r13       \n\
60            lslq 8,$r13          \n\
61            move.b %0,$r13       \n\
62            move.d $r13,%0       \n\
63            lslq 16,$r13         \n\
64            or.d $r13,%0"
65           : "=r" (lc) : "0" (lc) : "r13");
66
67   {
68     register char *dst __asm__ ("r13") = pdst;
69
70   /* This is NONPORTABLE, but since this whole routine is     */
71   /* grossly nonportable that doesn't matter.                 */
72
73   if (((unsigned long) pdst & 3) != 0
74      /* Oops! n=0 must be a legal call, regardless of alignment. */
75       && n >= 3)
76   {
77     if ((unsigned long)dst & 1)
78     {
79       *dst = (char) lc;
80       n--;
81       dst++;
82     }
83
84     if ((unsigned long)dst & 2)
85     {
86       *(short *)dst = lc;
87       n -= 2;
88       dst += 2;
89     }
90   }
91
92   /* Now the fun part.  For the threshold value of this, check the equation
93      above. */
94   /* Decide which copying method to use. */
95   if (n >= ZERO_BLOCK_SIZE)
96   {
97     /* For large copies we use 'movem' */
98
99   /* It is not optimal to tell the compiler about clobbering any
100      registers; that will move the saving/restoring of those registers
101      to the function prologue/epilogue, and make non-movem sizes
102      suboptimal.
103
104       This method is not foolproof; it assumes that the "asm reg"
105      declarations at the beginning of the function really are used
106      here (beware: they may be moved to temporary registers).
107       This way, we do not have to save/move the registers around into
108      temporaries; we can safely use them straight away.
109
110       If you want to check that the allocation was right; then
111       check the equalities in the first comment.  It should say
112       "r13=r13, r12=r12, r11=r11" */
113     __asm__ volatile ("                                                 \n\
114         ;; Check that the register asm declaration got right.           \n\
115         ;; The GCC manual says it will work, but there *has* been bugs. \n\
116         .ifnc %0-%1-%4,$r13-$r12-$r11                                   \n\
117         .err                                                            \n\
118         .endif                                                          \n\
119                                                                         \n\
120         ;; Save the registers we'll clobber in the movem process        \n\
121         ;; on the stack.  Don't mention them to gcc, it will only be    \n\
122         ;; upset.                                                       \n\
123         subq    11*4,$sp                                                \n\
124         movem   $r10,[$sp]                                              \n\
125                                                                         \n\
126         move.d  $r11,$r0                                                \n\
127         move.d  $r11,$r1                                                \n\
128         move.d  $r11,$r2                                                \n\
129         move.d  $r11,$r3                                                \n\
130         move.d  $r11,$r4                                                \n\
131         move.d  $r11,$r5                                                \n\
132         move.d  $r11,$r6                                                \n\
133         move.d  $r11,$r7                                                \n\
134         move.d  $r11,$r8                                                \n\
135         move.d  $r11,$r9                                                \n\
136         move.d  $r11,$r10                                               \n\
137                                                                         \n\
138         ;; Now we've got this:                                          \n\
139         ;; r13 - dst                                                    \n\
140         ;; r12 - n                                                      \n\
141                                                                         \n\
142         ;; Update n for the first loop                                  \n\
143         subq    12*4,$r12                                               \n\
144 0:                                                                      \n\
145         subq   12*4,$r12                                                \n\
146         bge     0b                                                      \n\
147         movem   $r11,[$r13+]                                            \n\
148                                                                         \n\
149         addq   12*4,$r12  ;; compensate for last loop underflowing n    \n\
150                                                                         \n\
151         ;; Restore registers from stack                                 \n\
152         movem [$sp+],$r10"
153
154      /* Outputs */ : "=r" (dst), "=r" (n)
155      /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
156   }
157
158     /* Either we directly starts copying, using dword copying
159        in a loop, or we copy as much as possible with 'movem'
160        and then the last block (<44 bytes) is copied here.
161        This will work since 'movem' will have updated src,dst,n. */
162
163     while ( n >= 16 )
164     {
165       *((long*)dst)++ = lc;
166       *((long*)dst)++ = lc;
167       *((long*)dst)++ = lc;
168       *((long*)dst)++ = lc;
169       n -= 16;
170     }
171
172     /* A switch() is definitely the fastest although it takes a LOT of code.
173      * Particularly if you inline code this.
174      */
175     switch (n)
176     {
177       case 0:
178         break;
179       case 1:
180         *(char*)dst = (char) lc;
181         break;
182       case 2:
183         *(short*)dst = (short) lc;
184         break;
185       case 3:
186         *((short*)dst)++ = (short) lc;
187         *(char*)dst = (char) lc;
188         break;
189       case 4:
190         *((long*)dst)++ = lc;
191         break;
192       case 5:
193         *((long*)dst)++ = lc;
194         *(char*)dst = (char) lc;
195         break;
196       case 6:
197         *((long*)dst)++ = lc;
198         *(short*)dst = (short) lc;
199         break;
200       case 7:
201         *((long*)dst)++ = lc;
202         *((short*)dst)++ = (short) lc;
203         *(char*)dst = (char) lc;
204         break;
205       case 8:
206         *((long*)dst)++ = lc;
207         *((long*)dst)++ = lc;
208         break;
209       case 9:
210         *((long*)dst)++ = lc;
211         *((long*)dst)++ = lc;
212         *(char*)dst = (char) lc;
213         break;
214       case 10:
215         *((long*)dst)++ = lc;
216         *((long*)dst)++ = lc;
217         *(short*)dst = (short) lc;
218         break;
219       case 11:
220         *((long*)dst)++ = lc;
221         *((long*)dst)++ = lc;
222         *((short*)dst)++ = (short) lc;
223         *(char*)dst = (char) lc;
224         break;
225       case 12:
226         *((long*)dst)++ = lc;
227         *((long*)dst)++ = lc;
228         *((long*)dst)++ = lc;
229         break;
230       case 13:
231         *((long*)dst)++ = lc;
232         *((long*)dst)++ = lc;
233         *((long*)dst)++ = lc;
234         *(char*)dst = (char) lc;
235         break;
236       case 14:
237         *((long*)dst)++ = lc;
238         *((long*)dst)++ = lc;
239         *((long*)dst)++ = lc;
240         *(short*)dst = (short) lc;
241         break;
242       case 15:
243         *((long*)dst)++ = lc;
244         *((long*)dst)++ = lc;
245         *((long*)dst)++ = lc;
246         *((short*)dst)++ = (short) lc;
247         *(char*)dst = (char) lc;
248         break;
249     }
250   }
251
252   return return_dst; /* destination pointer. */
253 } /* memset() */