sh: __copy_user() optimizations for small copies.

author Stuart Menefy <stuart.menefy@st.com>

Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)

committer Paul Mundt <lethal@linux-sh.org>

Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)
author Stuart Menefy <stuart.menefy@st.com>
Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)
committer Paul Mundt <lethal@linux-sh.org>
Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)
diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S

index ae039f2da16271c75e3013b97fa8a523a3b97621..a81dbdb05596993a580e9fb69d876133c1990d9c 100644 (file)
--- a/arch/sh/mm/copy_page.S
+++ b/arch/sh/mm/copy_page.S
@@ -141,47 +141,38 @@ ENTRY(__copy_user_page)
         .long 9999b, 6000f      ;       \
         .previous
  ENTRY(__copy_user)
-       tst     r6,r6           ! Check explicitly for zero
-       bf      1f
-       rts
-        mov    #0,r0           ! normal return
-1:
-       mov.l   r10,@-r15
-       mov.l   r9,@-r15
-       mov.l   r8,@-r15
+       ! Check if small number of bytes
+       mov     #11,r0
         mov     r4,r3
-       add     r6,r3           ! last destination address
-       mov     #12,r0          ! Check if small number of bytes
-       cmp/gt  r0,r6
-       bt      2f
-       bra     .L_cleanup_loop
-        nop
-2:
-       neg     r5,r0           ! Calculate bytes needed to align source
+       cmp/gt  r0,r6           ! r6 (len) > r0 (11)
+       bf/s    .L_cleanup_loop_no_pop
+        add    r6,r3           ! last destination address
+
+       ! Calculate bytes needed to align to src
+       mov.l   r11,@-r15
+       neg     r5,r0
+       mov.l   r10,@-r15
         add     #4,r0
+       mov.l   r9,@-r15
         and     #3,r0
+       mov.l   r8,@-r15
         tst     r0,r0
-       bt      .L_jump
-       mov     r0,r1
+       bt      2f
  
-.L_loop1:
-       ! Copy bytes to align source
-EX(    mov.b   @r5+,r0         )
-       dt      r1
-EX(    mov.b   r0,@r4          )
+1:
+       ! Copy bytes to long word align src
+EX(    mov.b   @r5+,r1         )
+       dt      r0
         add     #-1,r6
-       bf/s    .L_loop1
+EX(    mov.b   r1,@r4          )
+       bf/s    1b
          add    #1,r4
  
-.L_jump:
-       mov     r6,r2           ! Calculate number of longwords to copy
+       ! Jump to appropriate routine depending on dest
+2:     mov     #3,r1
+       mov     r6, r2
+       and     r4,r1
         shlr2   r2
-       tst     r2,r2
-       bt      .L_cleanup
-
-       mov     r4,r0           ! Jump to appropriate routine
-       and     #3,r0
-       mov     r0,r1
         shll2   r1
         mova    .L_jump_tbl,r0
         mov.l   @(r0,r1),r1
@@ -195,43 +186,97 @@ EX(       mov.b   r0,@r4          )
         .long   .L_dest10
         .long   .L_dest11
  
+/*
+ * Come here if there are less than 12 bytes to copy
+ *
+ * Keep the branch target close, so the bf/s callee doesn't overflow
+ * and result in a more expensive branch being inserted. This is the
+ * fast-path for small copies, the jump via the jump table will hit the
+ * default slow-path cleanup. -PFM.
+ */
+.L_cleanup_loop_no_pop:
+       tst     r6,r6           ! Check explicitly for zero
+       bt      1f
+
+2:
+EX(    mov.b   @r5+,r0         )
+       dt      r6
+EX(    mov.b   r0,@r4          )
+       bf/s    2b
+        add    #1,r4
+
+1:     mov     #0,r0           ! normal return
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+6000:
+       mov.l   8000f,r1
+       mov     r3,r0
+       jmp     @r1
+        sub    r4,r0
+       .align  2
+8000:  .long   5000b
+
+.previous
+       rts
+        nop
+
  ! Destination = 00
  
  .L_dest00:
-       mov     r2,r7
-       shlr2   r7
-       shlr    r7
-       tst     r7,r7
-       mov     #7,r0
-       bt/s    1f
-        and    r0,r2
-       .align 2
+       ! Skip the large copy for small transfers
+       mov     #(32+32-4), r0
+       cmp/gt  r6, r0          ! r0 (60) > r6 (len)
+       bt      1f
+
+       ! Align dest to a 32 byte boundary
+       neg     r4,r0
+       add     #0x20, r0
+       and     #0x1f, r0
+       tst     r0, r0
+       bt      2f
+
+       sub     r0, r6
+       shlr2   r0
+3:
+EX(    mov.l   @r5+,r1         )
+       dt      r0
+EX(    mov.l   r1,@r4          )
+       bf/s    3b
+        add    #4,r4
+
  2:
  EX(    mov.l   @r5+,r0         )
+EX(    mov.l   @r5+,r1         )
+EX(    mov.l   @r5+,r2         )
+EX(    mov.l   @r5+,r7         )
  EX(    mov.l   @r5+,r8         )
  EX(    mov.l   @r5+,r9         )
  EX(    mov.l   @r5+,r10        )
-EX(    mov.l   r0,@r4          )
-EX(    mov.l   r8,@(4,r4)      )
-EX(    mov.l   r9,@(8,r4)      )
-EX(    mov.l   r10,@(12,r4)    )
-EX(    mov.l   @r5+,r0         )
-EX(    mov.l   @r5+,r8         )
-EX(    mov.l   @r5+,r9         )
-EX(    mov.l   @r5+,r10        )
-       dt      r7
-EX(    mov.l   r0,@(16,r4)     )
-EX(    mov.l   r8,@(20,r4)     )
-EX(    mov.l   r9,@(24,r4)     )
-EX(    mov.l   r10,@(28,r4)    )
+EX(    mov.l   @r5+,r11        )
+EX(    movca.l r0,@r4          )
+       add     #-32, r6
+EX(    mov.l   r1,@(4,r4)      )
+       mov     #32, r0
+EX(    mov.l   r2,@(8,r4)      )
+       cmp/gt  r6, r0          ! r0 (32) > r6 (len)
+EX(    mov.l   r7,@(12,r4)     )
+EX(    mov.l   r8,@(16,r4)     )
+EX(    mov.l   r9,@(20,r4)     )
+EX(    mov.l   r10,@(24,r4)    )
+EX(    mov.l   r11,@(28,r4)    )
         bf/s    2b
          add    #32,r4
-       tst     r2,r2
+
+1:     mov     r6, r0
+       shlr2   r0
+       tst     r0, r0
         bt      .L_cleanup
  1:
-EX(    mov.l   @r5+,r0         )
-       dt      r2
-EX(    mov.l   r0,@r4          )
+EX(    mov.l   @r5+,r1         )
+       dt      r0
+EX(    mov.l   r1,@r4          )
         bf/s    1b
          add    #4,r4
  
@@ -250,7 +295,7 @@ EX( mov.l   r0,@r4          )
          and    r0,r2
  2:
         dt      r7
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
  EX(    mov.l   @r5+,r0         )
  EX(    mov.l   @r5+,r1         )
  EX(    mov.l   @r5+,r8         )
@@ -320,7 +365,7 @@ EX( mov.w   r0,@(2,r4)      )
  1:     ! Read longword, write two words per iteration
  EX(    mov.l   @r5+,r0         )
         dt      r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
  EX(    mov.w   r0,@r4          )
         shlr16  r0
  EX(    mov.w   r0,@(2,r4)      )
@@ -342,7 +387,7 @@ EX( mov.w   r0,@r4          )
         ! Read longword, write byte, word, byte per iteration
  EX(    mov.l   @r5+,r0         )
         dt      r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
  EX(    mov.b   r0,@r4          )
         shlr8   r0
         add     #1,r4
@@ -379,6 +424,7 @@ EX( mov.b   r0,@r4          )
  
  .L_exit:
         mov     #0,r0           ! normal return
+
  5000:
  
  # Exception handler:
@@ -394,5 +440,6 @@ EX( mov.b   r0,@r4          )
  .previous
         mov.l   @r15+,r8
         mov.l   @r15+,r9
+       mov.l   @r15+,r10
         rts
-        mov.l  @r15+,r10
+        mov.l  @r15+,r11
author	Stuart Menefy <stuart.menefy@st.com>
	Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)
committer	Paul Mundt <lethal@linux-sh.org>
	Fri, 28 Sep 2007 03:36:35 +0000 (12:36 +0900)