git.oblomov.eu Git - linux-2.6/blob - arch/x86/lib/memcpy_64.S

   1 /* Copyright 2002 Andi Kleen */
   2
   3 #include <linux/linkage.h>
   4
   5 #include <asm/cpufeature.h>
   6 #include <asm/dwarf2.h>
   7
   8 /*
   9  * memcpy - Copy a memory block.
  10  *
  11  * Input:
  12  *  rdi destination
  13  *  rsi source
  14  *  rdx count
  15  *
  16  * Output:
  17  * rax original destination
  18  */
  19
  20 /*
  21  * memcpy_c() - fast string ops (REP MOVSQ) based variant.
  22  *
  23  * Calls to this get patched into the kernel image via the
  24  * alternative instructions framework:
  25  */
  26         ALIGN
  27 memcpy_c:
  28         CFI_STARTPROC
  29         movq %rdi, %rax
  30
  31         movl %edx, %ecx
  32         shrl $3, %ecx
  33         andl $7, %edx
  34         rep movsq
  35         movl %edx, %ecx
  36         rep movsb
  37         ret
  38         CFI_ENDPROC
  39 ENDPROC(memcpy_c)
  40
  41 ENTRY(__memcpy)
  42 ENTRY(memcpy)
  43         CFI_STARTPROC
  44
  45         /*
  46          * Put the number of full 64-byte blocks into %ecx.
  47          * Tail portion is handled at the end:
  48          */
  49         movq %rdi, %rax
  50         movl %edx, %ecx
  51         shrl   $6, %ecx
  52         jz .Lhandle_tail
  53
  54         .p2align 4
  55 .Lloop_64:
  56         /*
  57          * We decrement the loop index here - and the zero-flag is
  58          * checked at the end of the loop (instructions inbetween do
  59          * not change the zero flag):
  60          */
  61         decl %ecx
  62
  63         /*
  64          * Move in blocks of 4x16 bytes:
  65          */
  66         movq 0*8(%rsi),         %r11
  67         movq 1*8(%rsi),         %r8
  68         movq %r11,              0*8(%rdi)
  69         movq %r8,               1*8(%rdi)
  70
  71         movq 2*8(%rsi),         %r9
  72         movq 3*8(%rsi),         %r10
  73         movq %r9,               2*8(%rdi)
  74         movq %r10,              3*8(%rdi)
  75
  76         movq 4*8(%rsi),         %r11
  77         movq 5*8(%rsi),         %r8
  78         movq %r11,              4*8(%rdi)
  79         movq %r8,               5*8(%rdi)
  80
  81         movq 6*8(%rsi),         %r9
  82         movq 7*8(%rsi),         %r10
  83         movq %r9,               6*8(%rdi)
  84         movq %r10,              7*8(%rdi)
  85
  86         leaq 64(%rsi), %rsi
  87         leaq 64(%rdi), %rdi
  88
  89         jnz  .Lloop_64
  90
  91 .Lhandle_tail:
  92         movl %edx, %ecx
  93         andl  $63, %ecx
  94         shrl   $3, %ecx
  95         jz   .Lhandle_7
  96
  97         .p2align 4
  98 .Lloop_8:
  99         decl %ecx
 100         movq (%rsi),            %r8
 101         movq %r8,               (%rdi)
 102         leaq 8(%rdi),           %rdi
 103         leaq 8(%rsi),           %rsi
 104         jnz  .Lloop_8
 105
 106 .Lhandle_7:
 107         movl %edx, %ecx
 108         andl $7, %ecx
 109         jz .Lend
 110
 111         .p2align 4
 112 .Lloop_1:
 113         movb (%rsi), %r8b
 114         movb %r8b, (%rdi)
 115         incq %rdi
 116         incq %rsi
 117         decl %ecx
 118         jnz .Lloop_1
 119
 120 .Lend:
 121         ret
 122         CFI_ENDPROC
 123 ENDPROC(memcpy)
 124 ENDPROC(__memcpy)
 125
 126         /*
 127          * Some CPUs run faster using the string copy instructions.
 128          * It is also a lot simpler. Use this when possible:
 129          */
 130
 131         .section .altinstr_replacement, "ax"
 132 1:      .byte 0xeb                              /* jmp <disp8> */
 133         .byte (memcpy_c - memcpy) - (2f - 1b)   /* offset */
 134 2:
 135         .previous
 136
 137         .section .altinstructions, "a"
 138         .align 8
 139         .quad memcpy
 140         .quad 1b
 141         .byte X86_FEATURE_REP_GOOD
 142
 143         /*
 144          * Replace only beginning, memcpy is used to apply alternatives,
 145          * so it is silly to overwrite itself with nops - reboot is the
 146          * only outcome...
 147          */
 148         .byte 2b - 1b
 149         .byte 2b - 1b
 150         .previous