CRIS v32: Update lib/checksum.S and lib/checksumcopy.S

[linux-2.6-omap-h63xx.git] / arch / cris / arch-v32 / lib / checksumcopy.S
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S

index 9303ccbadc6d91af82a03a84ae53eedbc37829ad..21aabe91489bf4d35c33d11d11c8d79f713d93cd 100644 (file)
--- a/arch/cris/arch-v32/lib/checksumcopy.S
+++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
  /*
   * A fast checksum+copy routine using movem
- * Copyright (c) 1998, 2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
   *
   * Authors:    Bjorn Wesen
   *
@@ -16,32 +16,23 @@ csum_partial_copy_nocheck:
         ;; r12 - length
         ;; r13 - checksum
  
-       ;; check for breakeven length between movem and normal word looping versions
-       ;; we also do _NOT_ want to compute a checksum over more than the
-       ;; actual length when length < 40
-
-       cmpu.w  80,$r12
-       blo     _word_loop
-       nop
-
-       ;; need to save the registers we use below in the movem loop
-       ;; this overhead is why we have a check above for breakeven length
-       ;; only r0 - r8 have to be saved, the other ones are clobber-able
-       ;; according to the ABI
+       ;; Optimized for large packets
+       subq    10*4, $r12
+       blt     _word_loop
+       move.d  $r12, $acr
  
         subq    9*4,$sp
-       subq    10*4,$r12       ; update length for the first loop
+       clearf  c
         movem   $r8,[$sp]
  
         ;; do a movem copy and checksum
-
  1:     ;; A failing userspace access (the read) will have this as PC.
  _mloop:        movem   [$r10+],$r9     ; read 10 longwords
+       addoq   -10*4, $acr, $acr ; loop counter in latency cycle
         movem   $r9,[$r11+]     ; write 10 longwords
  
         ;; perform dword checksumming on the 10 longwords
-
-       add.d   $r0,$r13
+       addc    $r0,$r13
         addc    $r1,$r13
         addc    $r2,$r13
         addc    $r3,$r13
@@ -52,47 +43,30 @@ _mloop:     movem   [$r10+],$r9     ; read 10 longwords
         addc    $r8,$r13
         addc    $r9,$r13
  
-       ;; fold the carry into the checksum, to avoid having to loop the carry
-       ;; back into the top
-
-       addc    0,$r13
-       addc    0,$r13          ; do it again, since we might have generated a carry
-
-       subq    10*4,$r12
-       bge     _mloop
-       nop
-
-       addq    10*4,$r12       ; compensate for last loop underflowing length
+       ;; test $acr, without trashing carry.
+       move.d  $acr, $acr
+       bpl     _mloop
+       ;; Copy acr into r12; r12 is needed after mloop and in the exception handlers.
+       move.d  $acr, $r12
  
+       ;; fold the last carry into r13
+       addc    0, $r13
         movem   [$sp+],$r8      ; restore regs
  
  _word_loop:
-       ;; only fold if there is anything to fold.
-
-       cmpq    0,$r13
-       beq     _no_fold
+       addq    10*4,$r12       ; compensate for last loop underflowing length
  
         ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
         ;; r9 can be used as temporary.
-
         move.d  $r13,$r9
         lsrq    16,$r9          ; r0 = checksum >> 16
         and.d   0xffff,$r13     ; checksum = checksum & 0xffff
-       add.d   $r9,$r13        ; checksum += r0
-       move.d  $r13,$r9        ; do the same again, maybe we got a carry last add
-       lsrq    16,$r9
-       and.d   0xffff,$r13
-       add.d   $r9,$r13
  
-_no_fold:
-       cmpq    2,$r12
+       subq    2, $r12
         blt     _no_words
-       nop
+       add.d   $r9,$r13        ; checksum += r9 (checksum >> 16)
  
         ;; copy and checksum the rest of the words
-
-       subq    2,$r12
-
  2:     ;; A failing userspace access for the read below will have this as PC.
  _wloop:        move.w  [$r10+],$r9
         addu.w  $r9,$r13
@@ -100,12 +74,9 @@ _wloop:     move.w  [$r10+],$r9
         bge     _wloop
         move.w  $r9,[$r11+]
  
-       addq    2,$r12
-
  _no_words:
-       ;; see if we have one odd byte more
-       cmpq    1,$r12
-       beq     _do_byte
+       addq    2,$r12
+       bne     _do_byte
         nop
         ret
         move.d  $r13,$r10