/* (gitweb scrape context: Merge branch 'linus' into x86/cleanups) */
/* [linux-2.6] arch/powerpc/lib/memcpy_64.S */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:   r3 = dest, r4 = src, r5 = byte count
 * Out:  r3 = original dest (stashed at 48(r1) on entry, reloaded before blr)
 * Uses: r0, r6-r12, ctr, cr1, cr6, cr7 as scratch (standard volatile set)
 *
 * Strategy:
 *   - n < 16                -> .Lshort_copy (word/half/byte moves keyed off
 *                              the low bits of n held in cr7)
 *   - dest not 8B aligned   -> .Ldst_unaligned fixes up to the boundary,
 *                              then falls into the aligned path
 *   - src not 8B aligned    -> .Lsrc_unaligned double-register shift/merge
 *                              loop (skipped via feature fixup on CPUs with
 *                              CPU_FTR_UNALIGNED_LD_STD, e.g. Power6, where
 *                              unaligned ld/std are fast)
 *   - both aligned          -> unrolled ld/std loop, 16 bytes/iteration
 *
 * NOTE(review): the 48(r1) save slot is a caller-frame offset defined by the
 * kernel's 64-bit stack-frame convention — confirm against the ABI if reused.
 */
        .align  7
_GLOBAL(memcpy)
        std     r3,48(r1)       /* save destination pointer for return value */
        PPC_MTOCRF      0x01,r5 /* cr7 = low 4 bits of n, for tail dispatch */
        cmpldi  cr1,r5,16
        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
        andi.   r6,r6,7
        dcbt    0,r4            /* prefetch first source line */
        blt     cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
BEGIN_FTR_SECTION
        nop
FTR_SECTION_ELSE
        bne     .Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
        addi    r3,r3,-16       /* bias dest for the stdu/16 store loop */
BEGIN_FTR_SECTION
        andi.   r0,r4,7         /* src misalignment check skipped on */
        bne     .Lsrc_unaligned /* CPUs with fast unaligned ld/std   */
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        srdi    r7,r5,4         /* ctr = number of 16-byte chunks */
        ld      r9,0(r4)
        addi    r4,r4,-8
        mtctr   r7
        andi.   r5,r5,7         /* r5 = leftover bytes after 8B chunks */
        bf      cr7*4+0,2f      /* n bit3 clear -> even # of 8B dwords */
        addi    r3,r3,8
        addi    r4,r4,8
        mr      r8,r9
        blt     cr1,3f
/* Main aligned loop: two overlapping ld/std streams, 16 bytes per pass. */
1:      ld      r9,8(r4)
        std     r8,8(r3)
2:      ldu     r8,16(r4)
        stdu    r9,16(r3)
        bdnz    1b
3:      std     r8,8(r3)
        beq     3f              /* no sub-8-byte tail -> done */
        addi    r3,r3,16
        ld      r9,8(r4)
/* Tail: r9 holds the last (partial) source dword; store 4/2/1 bytes
   as selected by bits 1-3 of cr7, rotating the wanted bytes into place. */
.Ldo_tail:
        bf      cr7*4+1,1f
        rotldi  r9,r9,32
        stw     r9,0(r3)
        addi    r3,r3,4
1:      bf      cr7*4+2,2f
        rotldi  r9,r9,16
        sth     r9,0(r3)
        addi    r3,r3,2
2:      bf      cr7*4+3,3f
        rotldi  r9,r9,8
        stb     r9,0(r3)
3:      ld      r3,48(r1)       /* return dest pointer */
        blr

/* Source not 8-byte aligned: read aligned dwords and merge neighbouring
   pairs with sld/srd (shift counts r10 = 8*misalign, r11 = 64-r10).
   Register roles are documented in the inline comments below. */
.Lsrc_unaligned:
        srdi    r6,r5,3         /* r6 = # of 8-byte dwords */
        addi    r5,r5,-16
        subf    r4,r0,r4        /* align src down to 8B boundary */
        srdi    r7,r5,4
        sldi    r10,r0,3        /* left-shift count (bits) */
        cmpdi   cr6,r6,3
        andi.   r5,r5,7
        mtctr   r7
        subfic  r11,r10,64      /* right-shift count (bits) */
        add     r5,r5,r0

        bt      cr7*4+0,0f      /* odd number of dest dwords? */

        ld      r9,0(r4)        # 3+2n loads, 2+2n stores
        ld      r0,8(r4)
        sld     r6,r9,r10
        ldu     r9,16(r4)
        srd     r7,r0,r11
        sld     r8,r0,r10
        or      r7,r7,r6
        blt     cr6,4f
        ld      r0,8(r4)
        # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
        b       2f

0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
        ldu     r9,8(r4)
        sld     r8,r0,r10
        addi    r3,r3,-8
        blt     cr6,5f
        ld      r0,8(r4)
        srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        srd     r7,r0,r11
        sld     r8,r0,r10
        addi    r3,r3,16
        beq     cr6,3f

        # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:      or      r7,r7,r6
        ld      r0,8(r4)
        std     r12,8(r3)
2:      srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        stdu    r7,16(r3)
        srd     r7,r0,r11
        sld     r8,r0,r10
        bdnz    1b

/* Drain the pipeline of already-loaded/merged dwords. */
3:      std     r12,8(r3)
        or      r7,r7,r6
4:      std     r7,16(r3)
5:      srd     r12,r9,r11
        or      r12,r8,r12
        std     r12,24(r3)
        beq     4f              /* no byte tail -> done */
        cmpwi   cr1,r5,8
        addi    r3,r3,32
        sld     r9,r9,r10       /* pre-shift remaining source bytes */
        ble     cr1,.Ldo_tail
        ld      r0,8(r4)        /* tail straddles a dword: merge one more */
        srd     r7,r0,r11
        or      r9,r7,r9
        b       .Ldo_tail

/* Destination not 8-byte aligned: copy 1/2/4 bytes (selected by the low
   bits of the gap r6, moved into cr7) to reach the boundary, then rejoin
   the aligned path. r7 tracks the running offset for the indexed forms. */
.Ldst_unaligned:
        PPC_MTOCRF      0x01,r6         # put #bytes to 8B bdry into cr7
        subf    r5,r6,r5
        li      r7,0
        cmpldi  cr1,r5,16
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        stb     r0,0(r3)
        addi    r7,r7,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r4
        sthx    r0,r7,r3
        addi    r7,r7,2
2:      bf      cr7*4+1,3f
        lwzx    r0,r7,r4
        stwx    r0,r7,r3
3:      PPC_MTOCRF      0x01,r5         /* reload cr7 with remaining count */
        add     r4,r6,r4
        add     r3,r6,r3
        b       .Ldst_aligned

/* n < 16: straight-line copy, sizes picked by bits of n in cr7
   (8, then 4, then 2, then 1 bytes). */
.Lshort_copy:
        bf      cr7*4+0,1f
        lwz     r0,0(r4)        /* 8 bytes as two word moves */
        lwz     r9,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r9,4(r3)
        addi    r3,r3,8
1:      bf      cr7*4+1,2f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4
2:      bf      cr7*4+2,3f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2
3:      bf      cr7*4+3,4f
        lbz     r0,0(r4)
        stb     r0,0(r3)
4:      ld      r3,48(r1)       /* return dest pointer */
        blr