Merge branches 'core/futexes', 'core/locking', 'core/rcu' and 'linus' into core/urgent
[linux-2.6] / arch / x86 / include / asm / xor_64.h
1 #ifndef _ASM_X86_XOR_64_H
2 #define _ASM_X86_XOR_64_H
3
4 /*
5  * Optimized RAID-5 checksumming functions for MMX and SSE.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2, or (at your option)
10  * any later version.
11  *
12  * You should have received a copy of the GNU General Public License
13  * (for example /usr/src/linux/COPYING); if not, write to the Free
14  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15  */
16
17
18 /*
19  * Cache avoiding checksumming functions utilizing KNI instructions
20  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21  */
22
23 /*
24  * Based on
25  * High-speed RAID5 checksumming functions utilizing SSE instructions.
26  * Copyright (C) 1998 Ingo Molnar.
27  */
28
29 /*
30  * x86-64 changes / gcc fixes from Andi Kleen.
31  * Copyright 2002 Andi Kleen, SuSE Labs.
32  *
33  * This hasn't been optimized for the hammer yet, but there are likely
34  * no advantages to be gotten from x86-64 here anyways.
35  */
36
/*
 * Backing store for one saved XMM register: two unsigned longs, with the
 * whole object aligned to a 16-byte boundary.
 */
typedef struct {
        unsigned long a;
        unsigned long b;
} __attribute__((aligned(16))) xmm_store_t;
40
41 /* Doesn't use gcc to save the XMM registers, because there is no easy way to
42    tell it to do a clts before the register saving. */
/*
 * XMMS_SAVE - open a section that may use %xmm0-%xmm3.
 *
 * Expands in a scope that must provide two locals by these exact names:
 *   cr0      - receives the current value of %cr0
 *   xmm_save - base address of at least 0x40 bytes of storage
 *
 * Sequence: disable preemption, copy %cr0 into `cr0`, execute clts, then
 * store %xmm0..%xmm3 at offsets 0x00/0x10/0x20/0x30 from `xmm_save`.
 * The "=&r" early-clobber on `cr0` keeps it from sharing a register with
 * the `xmm_save` pointer, which is still read after %cr0 has been written
 * to the output.  Preemption is left disabled; XMMS_RESTORE re-enables it.
 */
43 #define XMMS_SAVE                               \
44 do {                                            \
45         preempt_disable();                      \
46         asm volatile(                           \
47                 "movq %%cr0,%0          ;\n\t"  \
48                 "clts                   ;\n\t"  \
49                 "movups %%xmm0,(%1)     ;\n\t"  \
50                 "movups %%xmm1,0x10(%1) ;\n\t"  \
51                 "movups %%xmm2,0x20(%1) ;\n\t"  \
52                 "movups %%xmm3,0x30(%1) ;\n\t"  \
53                 : "=&r" (cr0)                   \
54                 : "r" (xmm_save)                \
55                 : "memory");                    \
56 } while (0)
57
/*
 * XMMS_RESTORE - close a section opened with XMMS_SAVE.
 *
 * Uses the same `cr0` and `xmm_save` locals that XMMS_SAVE filled in.
 * Sequence: sfence, reload %xmm0..%xmm3 from offsets 0x00/0x10/0x20/0x30
 * of `xmm_save`, write the saved value back to %cr0, then re-enable
 * preemption.  %cr0 is written last, after the XMM registers have been
 * reloaded.
 */
58 #define XMMS_RESTORE                            \
59 do {                                            \
60         asm volatile(                           \
61                 "sfence                 ;\n\t"  \
62                 "movups (%1),%%xmm0     ;\n\t"  \
63                 "movups 0x10(%1),%%xmm1 ;\n\t"  \
64                 "movups 0x20(%1),%%xmm2 ;\n\t"  \
65                 "movups 0x30(%1),%%xmm3 ;\n\t"  \
66                 "movq   %0,%%cr0        ;\n\t"  \
67                 :                               \
68                 : "r" (cr0), "r" (xmm_save)     \
69                 : "memory");                    \
70         preempt_enable();                       \
71 } while (0)
72
/*
 * String-building helpers for the inline asm bodies below.  Each expands
 * to one assembler line (or an offset expression) as a string literal.
 *
 *   OFFS(x)     - byte offset of 16-byte slot x:  "16*(x)"
 *   PF_OFFS(x)  - the same slot, 256 bytes further on:  "256+16*(x)"
 *   LD(x, y)    - movaps slot x of [p1] into %xmm<y>
 *   ST(x, y)    - movaps %xmm<y> back to slot x of [p1]
 *   PF0(x)      - prefetchnta at PF_OFFS(x) from [p1]
 *   PF1..PF5(x) - prefetchnta at PF_OFFS(x) from [p2]..[p6]
 *   XO1..XO5(x, y) - xorps slot x of [p2]..[p6] into %xmm<y>
 *
 * The [pN] names are asm operand names that the enclosing asm statement
 * must define.
 */
73 #define OFFS(x)         "16*("#x")"
74 #define PF_OFFS(x)      "256+16*("#x")"
75 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
76 #define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
77 #define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
78 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
79 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
80 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
81 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
82 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
83 #define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
84 #define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
85 #define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
86 #define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
87 #define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
88
89
90 static void
91 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92 {
93         unsigned int lines = bytes >> 8;
94         unsigned long cr0;
95         xmm_store_t xmm_save[4];
96
97         XMMS_SAVE;
98
99         asm volatile(
100 #undef BLOCK
101 #define BLOCK(i) \
102                 LD(i, 0)                                \
103                         LD(i + 1, 1)                    \
104                 PF1(i)                                  \
105                                 PF1(i + 2)              \
106                                 LD(i + 2, 2)            \
107                                         LD(i + 3, 3)    \
108                 PF0(i + 4)                              \
109                                 PF0(i + 6)              \
110                 XO1(i, 0)                               \
111                         XO1(i + 1, 1)                   \
112                                 XO1(i + 2, 2)           \
113                                         XO1(i + 3, 3)   \
114                 ST(i, 0)                                \
115                         ST(i + 1, 1)                    \
116                                 ST(i + 2, 2)            \
117                                         ST(i + 3, 3)    \
118
119
120                 PF0(0)
121                                 PF0(2)
122
123         " .align 32                     ;\n"
124         " 1:                            ;\n"
125
126                 BLOCK(0)
127                 BLOCK(4)
128                 BLOCK(8)
129                 BLOCK(12)
130
131         "       addq %[inc], %[p1]           ;\n"
132         "       addq %[inc], %[p2]           ;\n"
133                 "               decl %[cnt] ; jnz 1b"
134         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135         : [inc] "r" (256UL)
136         : "memory");
137
138         XMMS_RESTORE;
139 }
140
141 static void
142 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143           unsigned long *p3)
144 {
145         unsigned int lines = bytes >> 8;
146         xmm_store_t xmm_save[4];
147         unsigned long cr0;
148
149         XMMS_SAVE;
150
151         asm volatile(
152 #undef BLOCK
153 #define BLOCK(i) \
154                 PF1(i)                                  \
155                                 PF1(i + 2)              \
156                 LD(i, 0)                                        \
157                         LD(i + 1, 1)                    \
158                                 LD(i + 2, 2)            \
159                                         LD(i + 3, 3)    \
160                 PF2(i)                                  \
161                                 PF2(i + 2)              \
162                 PF0(i + 4)                              \
163                                 PF0(i + 6)              \
164                 XO1(i, 0)                               \
165                         XO1(i + 1, 1)                   \
166                                 XO1(i + 2, 2)           \
167                                         XO1(i + 3, 3)   \
168                 XO2(i, 0)                               \
169                         XO2(i + 1, 1)                   \
170                                 XO2(i + 2, 2)           \
171                                         XO2(i + 3, 3)   \
172                 ST(i, 0)                                \
173                         ST(i + 1, 1)                    \
174                                 ST(i + 2, 2)            \
175                                         ST(i + 3, 3)    \
176
177
178                 PF0(0)
179                                 PF0(2)
180
181         " .align 32                     ;\n"
182         " 1:                            ;\n"
183
184                 BLOCK(0)
185                 BLOCK(4)
186                 BLOCK(8)
187                 BLOCK(12)
188
189         "       addq %[inc], %[p1]           ;\n"
190         "       addq %[inc], %[p2]          ;\n"
191         "       addq %[inc], %[p3]           ;\n"
192                 "               decl %[cnt] ; jnz 1b"
193         : [cnt] "+r" (lines),
194           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195         : [inc] "r" (256UL)
196         : "memory");
197         XMMS_RESTORE;
198 }
199
200 static void
201 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202           unsigned long *p3, unsigned long *p4)
203 {
204         unsigned int lines = bytes >> 8;
205         xmm_store_t xmm_save[4];
206         unsigned long cr0;
207
208         XMMS_SAVE;
209
210         asm volatile(
211 #undef BLOCK
212 #define BLOCK(i) \
213                 PF1(i)                                  \
214                                 PF1(i + 2)              \
215                 LD(i, 0)                                \
216                         LD(i + 1, 1)                    \
217                                 LD(i + 2, 2)            \
218                                         LD(i + 3, 3)    \
219                 PF2(i)                                  \
220                                 PF2(i + 2)              \
221                 XO1(i, 0)                               \
222                         XO1(i + 1, 1)                   \
223                                 XO1(i + 2, 2)           \
224                                         XO1(i + 3, 3)   \
225                 PF3(i)                                  \
226                                 PF3(i + 2)              \
227                 PF0(i + 4)                              \
228                                 PF0(i + 6)              \
229                 XO2(i, 0)                               \
230                         XO2(i + 1, 1)                   \
231                                 XO2(i + 2, 2)           \
232                                         XO2(i + 3, 3)   \
233                 XO3(i, 0)                               \
234                         XO3(i + 1, 1)                   \
235                                 XO3(i + 2, 2)           \
236                                         XO3(i + 3, 3)   \
237                 ST(i, 0)                                \
238                         ST(i + 1, 1)                    \
239                                 ST(i + 2, 2)            \
240                                         ST(i + 3, 3)    \
241
242
243                 PF0(0)
244                                 PF0(2)
245
246         " .align 32                     ;\n"
247         " 1:                            ;\n"
248
249                 BLOCK(0)
250                 BLOCK(4)
251                 BLOCK(8)
252                 BLOCK(12)
253
254         "       addq %[inc], %[p1]           ;\n"
255         "       addq %[inc], %[p2]           ;\n"
256         "       addq %[inc], %[p3]           ;\n"
257         "       addq %[inc], %[p4]           ;\n"
258         "       decl %[cnt] ; jnz 1b"
259         : [cnt] "+c" (lines),
260           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261         : [inc] "r" (256UL)
262         : "memory" );
263
264         XMMS_RESTORE;
265 }
266
267 static void
268 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269           unsigned long *p3, unsigned long *p4, unsigned long *p5)
270 {
271         unsigned int lines = bytes >> 8;
272         xmm_store_t xmm_save[4];
273         unsigned long cr0;
274
275         XMMS_SAVE;
276
277         asm volatile(
278 #undef BLOCK
279 #define BLOCK(i) \
280                 PF1(i)                                  \
281                                 PF1(i + 2)              \
282                 LD(i, 0)                                \
283                         LD(i + 1, 1)                    \
284                                 LD(i + 2, 2)            \
285                                         LD(i + 3, 3)    \
286                 PF2(i)                                  \
287                                 PF2(i + 2)              \
288                 XO1(i, 0)                               \
289                         XO1(i + 1, 1)                   \
290                                 XO1(i + 2, 2)           \
291                                         XO1(i + 3, 3)   \
292                 PF3(i)                                  \
293                                 PF3(i + 2)              \
294                 XO2(i, 0)                               \
295                         XO2(i + 1, 1)                   \
296                                 XO2(i + 2, 2)           \
297                                         XO2(i + 3, 3)   \
298                 PF4(i)                                  \
299                                 PF4(i + 2)              \
300                 PF0(i + 4)                              \
301                                 PF0(i + 6)              \
302                 XO3(i, 0)                               \
303                         XO3(i + 1, 1)                   \
304                                 XO3(i + 2, 2)           \
305                                         XO3(i + 3, 3)   \
306                 XO4(i, 0)                               \
307                         XO4(i + 1, 1)                   \
308                                 XO4(i + 2, 2)           \
309                                         XO4(i + 3, 3)   \
310                 ST(i, 0)                                \
311                         ST(i + 1, 1)                    \
312                                 ST(i + 2, 2)            \
313                                         ST(i + 3, 3)    \
314
315
316                 PF0(0)
317                                 PF0(2)
318
319         " .align 32                     ;\n"
320         " 1:                            ;\n"
321
322                 BLOCK(0)
323                 BLOCK(4)
324                 BLOCK(8)
325                 BLOCK(12)
326
327         "       addq %[inc], %[p1]           ;\n"
328         "       addq %[inc], %[p2]           ;\n"
329         "       addq %[inc], %[p3]           ;\n"
330         "       addq %[inc], %[p4]           ;\n"
331         "       addq %[inc], %[p5]           ;\n"
332         "       decl %[cnt] ; jnz 1b"
333         : [cnt] "+c" (lines),
334           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335           [p5] "+r" (p5)
336         : [inc] "r" (256UL)
337         : "memory");
338
339         XMMS_RESTORE;
340 }
341
342 static struct xor_block_template xor_block_sse = {
343         .name = "generic_sse",
344         .do_2 = xor_sse_2,
345         .do_3 = xor_sse_3,
346         .do_4 = xor_sse_4,
347         .do_5 = xor_sse_5,
348 };
349
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition: the only template
 * passed to xor_speed() here is xor_block_sse.  xor_speed() is defined
 * outside this file; presumably it measures the template - confirm
 * against its definition.
 */
350 #undef XOR_TRY_TEMPLATES
351 #define XOR_TRY_TEMPLATES                       \
352 do {                                            \
353         xor_speed(&xor_block_sse);              \
354 } while (0)
355
356 /* We force the use of the SSE xor block because it can write around L2.
357    We may also be able to load into the L1 only depending on how the cpu
358    deals with a load to a line that is being prefetched.  */
/* The FASTEST argument is ignored; the result is always &xor_block_sse. */
359 #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360
361 #endif /* _ASM_X86_XOR_64_H */