/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer (AMD K8) yet, but there are
 * likely no advantages to be gained from x86-64 here anyway.
 */

typedef struct {
        unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;
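/* Four of these make up the 64-byte save area that XMMS_SAVE below
   uses to preserve %xmm0..%xmm3. */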

/* We don't use gcc to save the XMM registers, because there is no easy
   way to tell it to execute a clts before saving the registers. */
#define XMMS_SAVE                               \
do {                                            \
        preempt_disable();                      \
        asm volatile(                           \
                "movq %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory");                    \
} while (0)

#define XMMS_RESTORE                            \
do {                                            \
        asm volatile(                           \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movq   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory");                    \
        preempt_enable();                       \
} while (0)
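
/*
 * Usage sketch (illustrative only, kept out of the build with #if 0;
 * the function name is made up): every xor_sse_*() routine below
 * brackets its XMM usage exactly like this.  The macros reference the
 * two locals by name, so callers must declare an "unsigned long cr0"
 * and an "xmm_store_t xmm_save[4]".
 */
#if 0
static void xmm_usage_sketch(void)
{
        unsigned long cr0;              /* %cr0 is stashed here by XMMS_SAVE */
        xmm_store_t xmm_save[4];        /* backing store for %xmm0..%xmm3 */

        XMMS_SAVE;
        /* ... code may now clobber %xmm0..%xmm3 ... */
        XMMS_RESTORE;
}
#endif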

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
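
/*
 * Example expansion, for reference: LD(0, 0) produces
 *      "       movaps   16*(0)(%[p1]), %%xmm0    ;\n"
 * i.e. an aligned 16-byte load from p1, while PF0(4) produces
 *      "       prefetchnta 256+16*(4)(%[p1])         ;\n"
 * i.e. a non-temporal prefetch 320 bytes ahead of the current
 * position, so streamed source data largely bypasses the caches.
 */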


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}
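
/*
 * For reference, xor_sse_2() computes the same result as this plain C
 * loop (a sketch, not compiled; the name xor_ref_2 is made up).  As
 * with the SSE version, bytes must be a multiple of 256, the amount
 * processed per loop iteration above.
 */
#if 0
static void xor_ref_2(unsigned long bytes, unsigned long *p1,
                      unsigned long *p2)
{
        unsigned long i;

        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}
#endif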

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");
        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       addq %[inc], %[p5]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};
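
/*
 * The do_2..do_5 hooks are dispatched on the number of operand buffers
 * by the generic xor code (see the xor_block_template users in
 * crypto/xor.c); each folds its trailing sources into p1.
 */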

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        xor_speed(&xor_block_sse);              \
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
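
/*
 * How the two hooks above are consumed (a sketch, assuming the
 * calibration logic in crypto/xor.c; calibrate_sketch is a made-up
 * name):
 */
#if 0
static struct xor_block_template *fastest;

static void calibrate_sketch(void)
{
        XOR_TRY_TEMPLATES;                      /* benchmarks xor_block_sse */
        fastest = XOR_SELECT_TEMPLATE(fastest); /* here: always the SSE block */
}
#endif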