2 * "memcpy" implementation of SuperH
4 * Copyright (C) 1999 Niibe Yutaka
5 * Copyright (c) 2002 STMicroelectronics Ltd
6 * Modified from memcpy.S and micro-optimised for SH4
7 * Stuart Menefy (stuart.menefy@st.com)
10 #include <linux/linkage.h>
11 #include <linux/config.h>
14 * void *memcpy(void *dst, const void *src, size_t n);
16 * It is assumed that there is no overlap between src and dst.
17 * If there is an overlap, then the results are undefined.
! ----------------------------------------------------------------------
! Long-word bulk copy for the relative alignment drawn below: every
! output long word is merged from two neighbouring source long words
! (one byte from one of them, three bytes from the other).
! Source data is always read from address r0+r5.
!
! NOTE(review): this listing does not contain every instruction of the
! routine (no loop branch or store is present, for instance), so the
! byte pictures on the shift lines may assume shifts that are not shown.
! NOTE(review): the "<number> <group>" remark on each instruction is
! presumably its entry number and issue group (MT/EX/LS/BR) in the SH-4
! timing tables -- confirm against the hardware manual.
! ----------------------------------------------------------------------
21 ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
24 ! Size is 16 or greater, and may have trailing bytes
28 ! Read a long word and write a long word at once
29 ! At the start of each iteration, r7 contains last long load
! r2 = r4 (r4 is the DST pointer in the entry-point register diagram)
31 mov r4,r2 ! 5 MT (0 cycles latency)
! Prime r7 with a source long word before the loop is entered
33 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
38 #ifdef CONFIG_CPU_LITTLE_ENDIAN
39 ! 6 cycles, 4 bytes per iteration
! r1 = freshly loaded long word, r3 = copy of the previous load (r7)
40 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
41 mov r7, r3 ! 5 MT (latency=0) ! RQPO
! r6 = working copy of the new load, so r1 stays intact for r7 below
46 mov r1,r6 ! 5 MT (latency=0)
47 shll8 r3 ! 102 EX ! Oxxx
49 shlr8 r6 ! 106 EX ! xNML
! Remember the new load as the "last long load" for the next iteration
50 mov r1, r7 ! 5 MT (latency=0)
! Merge the two shifted parts into one output long word
52 or r6,r3 ! 82 EX ! ONML
! NOTE(review): the loop below mirrors the one above with the shift
! directions swapped; presumably it is the big-endian variant, but the
! #else/#endif lines are not visible here.
57 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
58 mov r7,r3 ! 5 MT (latency=0) ! OPQR
63 shlr8 r3 ! 106 EX ! xxxO
! r6 = working copy of the new load, so r1 stays intact for r7 below
64 mov r1,r6 ! 5 MT (latency=0)
66 shll8 r6 ! 102 EX ! LMNx
! Remember the new load as the "last long load" for the next iteration
67 mov r1,r7 ! 5 MT (latency=0)
! Merge the two shifted parts into one output long word
69 or r6,r3 ! 82 EX ! LMNO
74 ! Finally, copy a byte at once, if necessary
! T = (r0 > r2), unsigned comparison
82 8: cmp/hi r2,r0 ! 57 MT
! r1 = byte read from r0+r5
83 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
! ----------------------------------------------------------------------
! Long-word bulk copy for the relative alignment drawn below: every
! output long word is merged from two neighbouring source long words
! (three bytes from one of them, one byte from the other).
! Source data is read from address r0+r5; results are written with
! pre-decrement stores through r0, so the copy proceeds towards lower
! addresses.
!
! NOTE(review): this listing does not contain every instruction of the
! routine (loop branches are absent, for instance), so the byte pictures
! on the shift lines may assume shifts that are not shown.
! ----------------------------------------------------------------------
94 ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
97 ! Size is 16 or greater, and may have trailing bytes
101 ! Read a long word and write a long word at once
102 ! At the start of each iteration, r7 contains last long load
! r2 = r4 (r4 is the DST pointer in the entry-point register diagram)
104 mov r4,r2 ! 5 MT (0 cycles latency)
! Prime r7 with a source long word before the loop is entered
106 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
111 #ifdef CONFIG_CPU_LITTLE_ENDIAN
112 ! 6 cycles, 4 bytes per iteration
! r1 = freshly loaded long word, r3 = copy of the previous load (r7)
113 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
114 mov r7, r3 ! 5 MT (latency=0) ! RQPO
117 shll8 r3 ! 102 EX ! QPOx
! r6 = working copy of the new load, so r1 stays intact for r7 below
119 mov r1,r6 ! 5 MT (latency=0)
122 shlr8 r6 ! 106 EX ! xxxN
! Remember the new load as the "last long load" for the next iteration
123 mov r1, r7 ! 5 MT (latency=0)
! Merge the two shifted parts into one output long word
125 or r6,r3 ! 82 EX ! QPON
! Write the merged word: r0 is decremented by 4, then r3 is stored there
128 mov.l r3,@-r0 ! 30 LS
! NOTE(review): the KLMN byte order suggests this load opens the
! big-endian variant of the loop; the remainder is not visible here.
132 mov.l @(r0,r5),r1 ! KLMN
142 ! Finally, copy a byte at once, if necessary
! T = (r0 > r2), unsigned comparison
150 8: cmp/hi r2,r0 ! 57 MT
! Move one byte: read from r0+r5, then decrement r0 and store
151 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
155 mov.b r1,@-r0 ! 29 LS
! ----------------------------------------------------------------------
! memcpy entry sequence: establish the r0 / r5 addressing invariant,
! branch away for the fully aligned case, copy short requests a byte at
! a time, and otherwise align the destination before choosing a bulk
! routine.  Under the SH C calling convention the three arguments of the
! prototype arrive in r4 (dst), r5 (src) and r6 (n).
!
! All data movement here loads from r0+r5 and stores with pre-decrement
! through r0, i.e. the copy runs from the end of the buffers downwards.
!
! NOTE(review): bt/s and bf/s are delayed branches; the instruction that
! follows each of them in the complete routine still executes.  Not all
! of those instructions are visible in this listing.
! ----------------------------------------------------------------------
162 ! Calculate the invariants which will be used in the remainder
165 ! r4 --> [ ... ] DST [ ... ] SRC
168 ! r0 --> [ ... ] r0+r5 --> [ ... ]
172 ! Short circuit the common case of src, dst and len being 32 bit aligned
173 ! and test for zero length move
! r0 = r6 (the length argument)
175 mov r6, r0 ! 5 MT (0 cycle latency)
! Taken when T is set, which the remark marks as the zero-length case
181 bt/s 99f ! 111 BR (zero len)
! r0 = r4 (the destination pointer)
184 mov r4, r0 ! 5 MT (0 cycle latency)
! Taken when T is set, which the remark marks as the all-aligned case
188 bt/s .Lcase00 ! 111 BR (aligned)
192 ! Arguments are not nicely long word aligned or zero len.
193 ! Check for small copies, and if so do a simple byte at a time copy.
195 ! Deciding on an exact value of 'small' is not easy, as the point at which
196 ! using the optimised routines becomes worthwhile varies (these are the
197 ! cycle counts for different sizes using byte-at-a-time vs. optimised):
! NOTE(review): the last three columns are presumably the optimised
! routines for long-word, word and byte relative alignment -- confirm.
198 ! size byte-at-time long word byte
199 ! 16 42 39-40 46-50 50-55
200 ! 24 58 43-44 54-58 62-67
201 ! 36 82 49-50 66-70 80-85
202 ! However the penalty for getting it 'wrong' is much higher for long word
203 ! aligned data (and this is more common), so use a value of 16.
! Taken when T is clear: the request is not small, go to label 6
208 bf/s 6f ! 108 BR (not small)
! r3 = r5: index register used by the second load of each pair (label 4)
210 mov r5, r3 ! 5 MT (latency=0)
! Single byte: read from r0+r5, then decrement r0 and store
213 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
220 mov.b r1,@-r0 ! 29 LS
222 ! 4 cycles, 2 bytes per iteration
! Two loads (indexed by r5 and by r3) followed by two pre-decrement stores
223 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
225 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
228 mov.b r1,@-r0 ! 29 LS
231 mov.b r2,@-r0 ! 29 LS
239 ! Size is not small, so it is worthwhile looking for optimisations.
240 ! First align destination to a long word boundary.
242 ! r5 = normal value -1
! T = ((r0 & 3) == 0): the write pointer is already long word aligned
244 6: tst #3, r0 ! 87 MT
250 ! 3 cycles, 1 byte per iteration
252 mov.b @(r0,r5),r1 ! 19 LS (latency=2)
257 mov.b r1,@-r0 ! 28 LS
! Remove the -1 bias on r5 that is noted above
259 2: add #1, r5 ! 79 EX
261 ! Now select the appropriate bulk transfer code based on relative
262 ! alignment of src and dst.
! Park the write pointer in r3 and bring r5 into r0, because the
! immediate form of tst only operates on r0
264 mov r0, r3 ! 5 MT (latency=0)
266 mov r5, r0 ! 5 MT (latency=0)
! T = (r6 >= r7), signed comparison
274 cmp/ge r7, r6 ! 55 MT
! T = ((r0 & 2) == 0)
294 1: tst #2, r0 ! 87 MT
! ----------------------------------------------------------------------
! Copy path for the case where source and destination share long word
! alignment, so whole long words can be moved without any shifting.
! Loads come from r0+r5 (or r0+r3 for the second word of a pair) and
! stores pre-decrement r0, so the buffers are walked downwards.
! Requests that are not below the threshold held in r1 go to .Lcase00b.
!
! NOTE(review): the instructions that set r1, adjust r3/r5, count the
! iterations and close the loops are not visible in this listing.
! ----------------------------------------------------------------------
304 ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
307 ! src, dst and size are all long word aligned
! r3 = r5: index register for the second load of each pair (label 4)
313 mov r5, r3 ! 5 MT (latency=0)
! T = (r1 > r6), signed; NOTE(review): r1 presumably holds the 64 byte
! limit quoted in the .Lcase00b comments -- its load is not shown
315 cmp/gt r6, r1 ! 56 MT
! Taken when T is clear, i.e. the length in r6 is not below r1
318 bf .Lcase00b ! 108 BR (big loop)
! One long word: read from r0+r5, then decrement r0 by 4 and store
322 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
330 mov.l r1,@-r0 ! 30 LS
332 ! 4 cycles, 2 long words per iteration
333 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
335 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
338 mov.l r1, @-r0 ! 30 LS
341 mov.l r2, @-r0 ! 30 LS
347 ! Size is 16 or greater and less than 64, but may have trailing bytes
! r7 = r4 (start of the destination buffer)
352 mov r4, r7 ! 5 MT (latency=0)
354 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
! r3 = r5: index register for the second load of each pair (label 4)
360 mov r5, r3 ! 5 MT (latency=0)
364 mov.l r1,@-r0 ! 30 LS
366 ! 4 cycles, 2 long words per iteration
367 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
369 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
372 mov.l r1, @-r0 ! 30 LS
375 mov.l r2, @-r0 ! 30 LS
377 ! Copy the final 0-3 bytes
! T = (r0 == r4): the write pointer has reached the start of dst
381 cmp/eq r0, r4 ! 54 MT
386 ! 3 cycles, 1 byte per iteration
387 1: mov.b @(r0,r5),r1 ! 19 LS
391 mov.b r1,@-r0 ! 28 LS
! ----------------------------------------------------------------------
! Large-copy path for long word aligned data.  Three phases:
!   1. copy long words until the write pointer (r0) sits on a 32 byte
!      boundary (0x1f is the offset mask used below);
!   2. move one 32 byte block per iteration through eight registers,
!      writing the first word of each block with movca.l (a store that
!      allocates the cache block without fetching it from memory);
!   3. copy the remaining long words and the final 0-3 bytes.
! As elsewhere in this file, the buffers are walked towards lower
! addresses.  r8-r11 are saved on the stack (r15) around phase 2.
!
! NOTE(review): the masking, loop-closing branches and some pointer
! adjustments are not visible in this listing.
! ----------------------------------------------------------------------
396 ! Size is at least 64 bytes, so will be going round the big loop at least once.
399 ! r3 = rounded down r0
! r3 starts as a copy of r0; r1 = mask that clears the low five bits
406 mov r0, r3 ! 5 MT (latency=0)
407 mov #(~0x1f), r1 ! 6 EX
! r2 = r4 (start of dst), biased by 0x1f two lines further down
410 mov r4, r2 ! 5 MT (latency=0)
! T = (r0 == r3)
412 cmp/eq r3, r0 ! 54 MT
413 add #0x1f, r2 ! 50 EX
418 ! copy initial words until cache line aligned
420 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
! r6 = r5: index register for the second load of each pair (label 4)
423 mov r5, r6 ! 5 MT (latency=0)
! T = ((r0 & 0x18) == 0)
429 tst #0x18, r0 ! 87 MT
432 mov.l r1,@-r0 ! 30 LS
434 ! 4 cycles, 2 long words per iteration
435 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
437 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
! T = (r0 == r3): the write pointer has reached the rounded-down value
438 cmp/eq r3, r0 ! 54 MT
440 mov.l r1, @-r0 ! 30 LS
443 mov.l r7, @-r0 ! 30 LS
445 ! Copy the cache line aligned blocks
447 ! In use: r0, r2, r4, r5
448 ! Scratch: r1, r3, r6, r7
450 ! We could do this with the four scratch registers, but if src
451 ! and dest hit the same cache line, this will thrash, so make
452 ! use of additional registers.
454 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455 ! r5: src (was r0+r5)
457 ! this can be reversed at the end, so we don't need to save any extra
! Push r8-r11; they are popped again in the opposite order after the loop
460 1: mov.l r8, @-r15 ! 30 LS
463 mov.l r9, @-r15 ! 30 LS
! r1 takes over as the write pointer while r0 is used as a data register
464 mov r0, r1 ! 5 MT (latency=0)
466 mov.l r10, @-r15 ! 30 LS
! Bias r5 by -0x1c; it is undone by the "add #0x1c, r5" after the loop
467 add #-0x1c, r5 ! 50 EX
469 mov.l r11, @-r15 ! 30 LS
471 ! 16 cycles, 32 bytes per iteration
! Read eight long words from r5+0x00 .. r5+0x1c
472 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
! Step the write pointer down by one 32 byte block
473 add #-0x20, r1 ! 50 EX
474 mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
475 mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
476 mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
477 mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
478 mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
479 mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
480 mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
! First word of the block goes out with movca.l (its source must be r0);
! the other seven follow with ordinary displacement stores
481 movca.l r0,@r1 ! 40 LS (latency=3-7)
482 mov.l r3,@(0x04,r1) ! 33 LS
483 mov.l r6,@(0x08,r1) ! 33 LS
484 mov.l r7,@(0x0c,r1) ! 33 LS
486 mov.l r8,@(0x10,r1) ! 33 LS
! Step the read pointer down by one 32 byte block
487 add #-0x20, r5 ! 50 EX
489 mov.l r9,@(0x14,r1) ! 33 LS
492 mov.l r10,@(0x18,r1) ! 33 LS
495 mov.l r11,@(0x1c,r1) ! 33 LS
! Hand the write pointer back to r0
497 mov r1, r0 ! 5 MT (latency=0)
! Pop r11..r8 (reverse of the push order above)
499 mov.l @r15+, r11 ! 15 LS
502 mov.l @r15+, r10 ! 15 LS
! T = (r0 == r4): the write pointer has reached the start of dst
503 cmp/eq r4, r0 ! 54 MT
506 mov.l @r15+, r9 ! 15 LS
509 1: mov.l @r15+, r8 ! 15 LS
! r1 = r1 - r4
510 sub r4, r1 ! 75 EX (len remaining)
512 ! number of trailing bytes is non-zero
514 ! invariants restored (r5 already decremented by 4)
515 ! also r1=num bytes remaining
! r7 = r4 (start of the destination buffer)
518 mov r4, r7 ! 5 MT (latency=0)
520 add #0x1c, r5 ! 50 EX (back to -4)
! T = (r1 >= r2), unsigned comparison
521 cmp/hs r2, r1 ! 58 MT
526 mov.l @(r0, r5), r6 ! 21 LS (latency=2)
! r3 = r5: index register for the second load of each pair (label 4)
529 mov r5, r3 ! 5 MT (latency=0)
! T = (r1 >= r2), unsigned comparison
533 cmp/hs r2, r1 ! 58 MT
536 mov.l r6,@-r0 ! 30 LS
538 ! 4 cycles, 2 long words per iteration
539 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
541 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
544 mov.l r6, @-r0 ! 30 LS
547 mov.l r2, @-r0 ! 30 LS
549 ! Copy the final 0-3 bytes
! T = (r0 == r4): the write pointer has reached the start of dst
551 5: cmp/eq r0, r4 ! 54 MT
557 ! 3 cycles, 1 byte per iteration
558 1: mov.b @(r0,r5),r1 ! 19 LS
562 mov.b r1,@-r0 ! 28 LS
! ----------------------------------------------------------------------
! Copy path for the relative alignment drawn below, where source and
! destination differ by two bytes.  Short requests are moved a halfword
! at a time; large ones first reach a 32 byte boundary with halfword
! copies and then move one 32 byte block per iteration.  In the block
! loop the source is read as halfword + seven long words + halfword and
! re-aligned with xtrct (xtrct Rm,Rn: Rn = (Rm << 16) | (Rn >> 16)).
! r8-r12 are pushed on the stack (r15) before the block loop.
!
! NOTE(review): loop-closing branches, some xtrct steps and the
! #else/#endif lines are not visible in this listing.
! ----------------------------------------------------------------------
568 ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
573 ! Size is 16 or greater and less than 64, but may have trailing bytes
! r6 = r5: index register for the second halfword load in the loop below
575 2: mov r5, r6 ! 5 MT (latency=0)
! r2 = r4 (start of the destination buffer)
578 mov r4,r2 ! 5 MT (latency=0)
! Two halfword loads (indexed by r5 and r6), two pre-decrement stores
582 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
584 mov.w @(r0,r6),r3 ! 20 LS (latency=2)
587 mov.w r1,@-r0 ! 29 LS
590 mov.w r3,@-r0 ! 29 LS
598 ! Size is at least 64 bytes, so will be going round the big loop at least once.
601 ! r3 = rounded down r0
! r3 starts as a copy of r0; r1 = mask that clears the low five bits
603 mov r0, r3 ! 5 MT (latency=0)
604 mov #(~0x1f), r1 ! 6 EX
! r2 = r4 (start of dst), biased by 0x1f two lines further down
607 mov r4, r2 ! 5 MT (latency=0)
! T = (r0 == r3)
609 cmp/eq r3, r0 ! 54 MT
610 add #0x1f, r2 ! 50 EX
616 ! Copy a short word one at a time until we are cache line aligned
617 ! Normal values: r0, r2, r3, r4
623 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
628 mov.w r1,@-r0 ! 29 LS
630 ! Copy the cache line aligned blocks
632 ! In use: r0, r2, r4, r5 (=r5-2)
633 ! Scratch: r1, r3, r6, r7
635 ! We could do this with the four scratch registers, but if src
636 ! and dest hit the same cache line, this will thrash, so make
637 ! use of additional registers.
639 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640 ! r5: src (was r0+r5)
642 ! this can be reversed at the end, so we don't need to save any extra
! Push r8-r12 so they can serve as extra data registers in the loop
645 1: mov.l r8, @-r15 ! 30 LS
648 mov.l r9, @-r15 ! 30 LS
! r1 takes over as the write pointer while r0 is used as a data register
649 mov r0, r1 ! 5 MT (latency=0)
651 mov.l r10, @-r15 ! 30 LS
! Bias r5 by -0x1e; an "add #0x1e, r5" after the loop undoes it
652 add #-0x1e, r5 ! 50 EX
654 mov.l r11, @-r15 ! 30 LS
656 mov.l r12, @-r15 ! 30 LS
658 ! 17 cycles, 32 bytes per iteration
659 #ifdef CONFIG_CPU_LITTLE_ENDIAN
! The post-increment loads below advance r5 by 2 + 7*4 + 2 = 0x20 bytes
660 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
! Step the write pointer down by one 32 byte block
661 add #-0x20, r1 ! 50 EX
663 mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
665 mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
! Move the leading halfword into the upper half so xtrct can pair it
666 shll16 r0 ! 103 EX JI..
668 mov.l @r5+, r7 ! 15 LS (latency=2)
669 xtrct r3, r0 ! 48 EX LKJI
671 mov.l @r5+, r8 ! 15 LS (latency=2)
672 xtrct r6, r3 ! 48 EX PONM
674 mov.l @r5+, r9 ! 15 LS (latency=2)
677 mov.l @r5+, r10 ! 15 LS (latency=2)
680 mov.l @r5+, r11 ! 15 LS (latency=2)
! Trailing halfword; only its low 16 bits are used, by "xtrct r12, r11"
683 mov.w @r5+, r12 ! 15 LS (latency=2)
684 xtrct r10, r9 ! 48 EX
! First word of the block goes out with movca.l (its source must be r0)
686 movca.l r0,@r1 ! 40 LS (latency=3-7)
687 xtrct r11, r10 ! 48 EX
689 mov.l r3, @(0x04,r1) ! 33 LS
690 xtrct r12, r11 ! 48 EX
692 mov.l r6, @(0x08,r1) ! 33 LS
694 mov.l r7, @(0x0c,r1) ! 33 LS
696 mov.l r8, @(0x10,r1) ! 33 LS
! Net effect with the 0x20 of post-increments: r5 moves down by 0x20
697 add #-0x40, r5 ! 50 EX
699 mov.l r9, @(0x14,r1) ! 33 LS
702 mov.l r10, @(0x18,r1) ! 33 LS
705 mov.l r11, @(0x1c,r1) ! 33 LS
! ----------------------------------------------------------------------
! Alternative body of the 32 byte block loop that uses fixed
! displacements from r5 instead of post-increment loads.
! NOTE(review): presumably this is the big-endian (#else) branch of the
! CONFIG_CPU_LITTLE_ENDIAN conditional; the #else/#endif lines are not
! visible in this listing.
!
! FIX: the last source fetch was "mov.w @(0x02,r5), r12".  On SH-4 the
! displacement form of mov.w can only load into r0, so that line cannot
! be encoded.  It now loads the long word at r5+0x00 instead.  r5 must
! already be long word aligned for the neighbouring mov.l loads, and on
! a big-endian CPU the low 16 bits of that long word are the halfword at
! r5+0x02.  "xtrct r12, r11" uses only the low 16 bits of r12, so the
! merged result is unchanged.
!
! NOTE(review): none of the loads shown here advances r5, yet r5 is
! stepped by 0x3e per pass, whereas the post-increment variant nets
! 0x20.  The first two loads (0x1e halfword, 0x1c long word) also
! overlap.  Unless instructions not visible here compensate, this path
! needs checking on big-endian hardware.
! ----------------------------------------------------------------------
707 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
710 mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
713 mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
716 mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
719 mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
722 mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
725 mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
728 mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
! Long word load (see FIX above); only the low 16 bits of r12 are used
731 mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
732 xtrct r10, r9 ! 48 EX
! movca.l needs r0 as its source register
734 movca.l r0,@r1 ! 40 LS (latency=3-7)
! Re-base r1 so the remaining stores can use displacements 0x04..0x1c
735 add #-0x1c, r1 ! 50 EX
737 mov.l r3, @(0x1c,r1) ! 33 LS
738 xtrct r11, r10 ! 48 EX
740 mov.l r6, @(0x18,r1) ! 33 LS
741 xtrct r12, r11 ! 48 EX
743 mov.l r7, @(0x14,r1) ! 33 LS
745 mov.l r8, @(0x10,r1) ! 33 LS
746 add #-0x3e, r5 ! 50 EX
748 mov.l r9, @(0x0c,r1) ! 33 LS
751 mov.l r10, @(0x08,r1) ! 33 LS
754 mov.l r11, @(0x04,r1) ! 33 LS
! ----------------------------------------------------------------------
! Exit from the 32 byte block loop of the two-byte-offset copy path:
! give the write pointer back to r0, restore the saved registers, remove
! the bias that was applied to r5, then finish with halfword copies.
! NOTE(review): r12 is pushed together with r8-r11 before the loop, but
! its restore is not visible in this listing -- confirm it precedes the
! pops below.
! ----------------------------------------------------------------------
! r0 = r1: the write pointer returns to its usual register
758 mov r1, r0 ! 5 MT (latency=0)
! Pop r11..r8 (reverse of the order in which they were pushed)
760 mov.l @r15+, r11 ! 15 LS
763 mov.l @r15+, r10 ! 15 LS
! T = (r0 == r4): the write pointer has reached the start of dst
764 cmp/eq r4, r0 ! 54 MT
767 mov.l @r15+, r9 ! 15 LS
770 1: mov.l @r15+, r8 ! 15 LS
! Undo the -0x1e bias applied to r5 before the block loop
772 add #0x1e, r5 ! 50 EX
774 ! Finish off a short word at a time
775 ! r5 must be invariant - 2
! r2 = r4 (start of the destination buffer)
776 10: mov r4,r2 ! 5 MT (latency=0)
! T = (r0 > r2), unsigned comparison
779 cmp/hi r2, r0 ! 57 MT
! One halfword: read from r0+r5, then decrement r0 by 2 and store
784 3: mov.w @(r0,r5),r1 ! 20 LS
789 mov.w r1,@-r0 ! 29 LS
793 ! Finally, copy the last byte if necessary