aboutsummaryrefslogtreecommitdiff
path: root/Additional_Implementations/skein_block_xmm32.asm
diff options
context:
space:
mode:
Diffstat (limited to 'Additional_Implementations/skein_block_xmm32.asm')
-rw-r--r--Additional_Implementations/skein_block_xmm32.asm1167
1 files changed, 1167 insertions, 0 deletions
diff --git a/Additional_Implementations/skein_block_xmm32.asm b/Additional_Implementations/skein_block_xmm32.asm
new file mode 100644
index 000000000000..96ef121cd49a
--- /dev/null
+++ b/Additional_Implementations/skein_block_xmm32.asm
@@ -0,0 +1,1167 @@
+;
+;----------------------------------------------------------------
+; 32-bit x86 assembler code for Skein block functions using XMM registers
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+ .386p
+ .model flat
+ .code
+ .xmm ;enable XMM instructions
+;
+_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_ = SKEIN_USE_ASM
+else
+_USE_ASM_ = _MASK_ALL_
+endif
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP
+_SKEIN_LOOP = 0 ;default is all fully unrolled
+else
+_SKEIN_LOOP = SKEIN_LOOP
+endif
+;--------------
+; the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10
+;
+SKEIN_ASM_UNROLL = 0
+ irp _NN_,<256,512,1024>
+ if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_
+ endif
+ endm
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+else
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5)
+endif
+irp _NN_,<256,512,1024>
+ if _USE_ASM_ and _NN_
+ irp _RR_,<%(ROUNDS_&_NN_)>
+ if _NN_ eq 1024
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ else
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ endif
+ endm
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE equ (1)
+else
+ifdef SKEIN_PERF ;use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+else
+_SKEIN_DEBUG = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+;
+HASH_BITS = 0 ;# bits of hash output
+BCNT = 4 + HASH_BITS ;number of bytes in BUFFER[]
+TWEAK = 4 + BCNT ;tweak values[0..1]
+X_VARS = 16 + TWEAK ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+KW_PARITY_LO= 0A9FC1A22h ;overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI= 01BD11BDAh
+FIRST_MASK8 = NOT (1 SHL 6) ;FIRST block flag bit
+;
+; rotation constants for Skein
+;
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+mov64 macro x0,x1
+ movq x0,x1
+endm
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack
+StackVar macro localName,localSize
+localName = _STK_OFFS_
+_STK_OFFS_ = _STK_OFFS_+(localSize)
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars
+;
+Setup_Stack macro WCNT,RND_CNT
+_STK_OFFS_ = 0 ;starting offset from esp, forced on 16-byte alignment
+ ;----- local variables ;<-- esp
+ StackVar X_stk , 8*(WCNT) ;local context vars
+ StackVar Wcopy , 8*(WCNT) ;copy of input block
+ StackVar ksTwk ,16*3 ;key schedule: tweak words
+ StackVar ksKey ,16*(WCNT)+16;key schedule: key words
+FRAME_OFFS = ksTwk+128 ;<-- ebp
+ if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0
+ StackVar ksRot,16*(RND_CNT/4);leave space for ks "rotation" to happen
+ endif
+LOCAL_SIZE = _STK_OFFS_ ;size of local vars
+ ;
+ ;"restart" the stack defns, because we relocate esp to guarantee alignment
+ ; (i.e., these vars are NOT at fixed offsets from esp)
+_STK_OFFS_ = 0
+ ;-----
+ StackVar savRegs,8*4 ;pushad data
+ StackVar retAddr,4 ;return address
+ ;----- caller parameters
+ StackVar ctxPtr ,4 ;context ptr
+ StackVar blkPtr ,4 ;pointer to block data
+ StackVar blkCnt ,4 ;number of full blocks to process
+ StackVar bitAdd ,4 ;bit count to add to tweak
+ ;----- caller's stack frame
+;
+; Notes on stack frame setup:
+; * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
+; * the next most used is the key schedule words
+; so ebp is "centered" there, allowing short offsets to the key/tweak
+; schedule in 256/512-bit Skein cases, but not posible for Skein-1024 :-(
+; * the Wcopy variables are infrequently accessed, and they have long
+; offsets from both esp and ebp only in the 1024-bit case.
+; * all other local vars and calling parameters can be accessed
+; with short offsets, except in the 1024-bit case
+;
+ pushad ;save all regs
+ mov ebx,esp ;keep ebx as pointer to caller parms
+ sub esp,LOCAL_SIZE ;make room for the locals
+ and esp,not 15 ;force alignment
+ mov edi,[ebx+ctxPtr ] ;edi --> Skein context
+ lea ebp,[esp+FRAME_OFFS] ;maximize use of short offsets from ebp
+ mov ecx,ptr32 [ebx+blkCnt] ;keep block cnt in ecx
+;
+endm ;Setup_Stack
+;
+FP_ equ <ebp-FRAME_OFFS> ;keep as many short offsets as possible
+SI_ equ <esi-FRAME_OFFS> ;keep as many short offsets as possible
+ptr64 equ <qword ptr> ;useful abbreviations
+ptr32 equ <dword ptr>
+ptr08 equ <byte ptr>
+;
+;----------------------------------------------------------------
+;
+Reset_Stack macro procStart
+ mov esp,ebx ;get rid of locals (wipe??)
+ popad ;restore all regs
+
+ ;display code size in bytes to stdout
+ irp _BCNT_,<%($+1-procStart)> ;account for return opcode
+if _BCNT_ ge 10000 ;(align it all pretty)
+%out procStart code size = _BCNT_ bytes
+elseif _BCNT_ ge 1000
+%out procStart code size = _BCNT_ bytes
+else
+%out procStart code size = _BCNT_ bytes
+endif
+ endm ;irp _BCNT_
+
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+ extrn _Skein_Show_Block:near ;calls to C routines
+ extrn _Skein_Show_Round:near
+;
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+;
+Skein_Debug_Block macro BLK_BITS
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+; const u08b_t *blkPtr, const u64b_t *wPtr,
+; const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+ Put_XMM_&BLK_BITS
+ pushad ;save all regs
+ lea eax,[FP_+ksTwk+1] ;+1 = flag: "stride" size = 2 qwords
+ lea esi,[FP_+ksKey+1]
+ lea ecx,[esp+32+Wcopy] ;adjust offset by 32 for pushad
+ mov edx,[ebx+ctxPtr] ;ctx_hdr_ptr
+ lea edx,[edx+X_VARS] ;edx ==> cxt->X[]
+ push eax ;tsPtr
+ push esi ;ksPtr
+ push ecx ;wPtr
+ push ptr32 [ebx+blkPtr] ;blkPtr
+ push edx ;ctx->Xptr
+ push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_BITS
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Block-4 ;strange linkage??
+ else
+ call _Skein_Show_Block
+ endif
+ add esp,7*4 ;discard parameter space on stack
+ popad ;restore regs
+;
+ Get_XMM_&BLK_BITS
+endm ;Skein_Debug_Block
+
+;
+Skein_Debug_Round macro BLK_BITS,R,saveRegs
+;
+;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+ ifnb <saveRegs>
+ Put_XMM_&BLK_BITS
+ endif
+ pushad ;save all regs
+ if R ne SKEIN_RND_FEED_FWD
+ lea eax,[esp+32+X_stk] ;adjust offset by 32 for pushad
+ else
+ mov eax,[ebx+ctxPtr]
+ add eax,X_VARS
+ endif
+ push eax ;Xptr
+ if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+ mov eax,R
+ else
+ lea eax,[4*edx+1+(((R)-1) and 3)] ;compute round number using edx
+ endif
+ push eax ;round number
+ push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_BITS
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Round-4 ;strange linkage??
+ else
+ call _Skein_Show_Round
+ endif
+ add esp,4*4 ;discard parameter space on stack
+ popad ;restore regs
+
+ ifnb <saveRegs>
+ Get_XMM_&BLK_BITS ;save internal vars for debug dump
+ endif
+endm ;Skein_Debug_Round
+endif ;ifdef SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; useful macros
+_ldX macro xn
+ ifnb <xn>
+ mov64 xmm&xn,ptr64 [esp+X_stk+8*xn]
+ endif
+endm
+
+_stX macro xn
+ ifnb <xn>
+ mov64 ptr64 [esp+X_stk+8*xn],xmm&xn
+ endif
+endm
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 256
+ public _Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; Skein-256 round macros
+;
+R_256_OneRound macro _RR_,x0,x1,x2,x3,t0,t1
+ irp _qq_,<%((_RR_) and 7)> ;figure out which rotation constants to use
+ if x0 eq 0
+_RC0_ = RC_256_&_qq_&_0
+_RC1_ = RC_256_&_qq_&_1
+ else
+_RC0_ = RC_256_&_qq_&_1
+_RC1_ = RC_256_&_qq_&_0
+ endif
+ endm
+;
+ paddq xmm&x0,xmm&x1
+ mov64 xmm&t0,xmm&x1
+ psllq xmm&x1, _RC0_
+ psrlq xmm&t0,64-_RC0_
+ xorpd xmm&x1,xmm&x0
+ xorpd xmm&x1,xmm&t0
+;
+ paddq xmm&x2,xmm&x3
+ mov64 xmm&t1,xmm&x3
+ psllq xmm&x3, _RC1_
+ psrlq xmm&t1,64-_RC1_
+ xorpd xmm&x3,xmm&x2
+ xorpd xmm&x3,xmm&t1
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 256,%(_RR_+1),saveRegs
+ endif
+endm ;R_256_OneRound
+;
+R_256_FourRounds macro _RN_
+ R_256_OneRound (_RN_+0),0,1,2,3,4,5
+ R_256_OneRound (_RN_+1),2,1,0,3,4,5
+
+ R_256_OneRound (_RN_+2),0,1,2,3,4,5
+ R_256_OneRound (_RN_+3),2,1,0,3,4,5
+
+ ;inject key schedule
+ inc edx ;bump round number
+ movd xmm4,edx
+ if _UNROLL_CNT eq (ROUNDS_256/8)
+ ;fully unrolled version
+_RK_ = ((_RN_)/4) ;key injection counter
+ paddq xmm0,[FP_+ksKey+16*((_RK_+1) mod 5)]
+ paddq xmm1,[FP_+ksKey+16*((_RK_+2) mod 5)]
+ paddq xmm2,[FP_+ksKey+16*((_RK_+3) mod 5)]
+ paddq xmm3,[FP_+ksKey+16*((_RK_+4) mod 5)]
+ paddq xmm1,[FP_+ksTwk+16*((_RK_+1) mod 3)]
+ paddq xmm2,[FP_+ksTwk+16*((_RK_+2) mod 3)]
+ paddq xmm3,xmm4
+ else ;looping version
+ paddq xmm0,[SI_+ksKey+16*1]
+ paddq xmm1,[SI_+ksKey+16*2]
+ paddq xmm2,[SI_+ksKey+16*3]
+ paddq xmm3,[SI_+ksKey+16*4]
+ paddq xmm1,[SI_+ksTwk+16*1]
+ paddq xmm2,[SI_+ksTwk+16*2]
+ paddq xmm3,xmm4
+;
+ mov64 xmm4,<ptr64 [SI_+ksKey]>;first, "rotate" key schedule on the stack
+ mov64 xmm5,<ptr64 [SI_+ksTwk]>; (for next time through)
+ mov64 <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm4
+ mov64 <ptr64 [SI_+ksTwk+16*3]>,xmm5
+ add esi,16 ;bump rolling pointer
+ endif
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs
+ endif
+endm ;R256_FourRounds
+;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_256 equ <call _Put_XMM_256>
+Get_XMM_256 equ <call _Get_XMM_256>
+
+_Put_XMM_256:
+ irp _NN_,<0,1,2,3>
+ mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_
+ endm
+ ret
+;
+_Get_XMM_256:
+ irp _NN_,<0,1,2,3>
+ mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8]
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+_Skein_256_Process_Block proc near
+ WCNT = 4 ;WCNT=4 for Skein-256
+ Setup_Stack WCNT,ROUNDS_256
+ ; main hash loop for Skein_256
+Skein_256_block_loop:
+ movd xmm4,ptr32 [ebx+bitAdd]
+ mov64 xmm5,ptr64 [edi+TWEAK+0]
+ mov64 xmm6,ptr64 [edi+TWEAK+8]
+ paddq xmm5,xmm4 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [edi+TWEAK],xmm5 ;save updated tweak value T0 (for next time)
+ movapd xmm7,xmm6
+ xorpd xmm7,xmm5 ;compute overall tweak parity
+ movdqa [FP_+ksTwk ],xmm5 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16],xmm6
+ movdqa [FP_+ksTwk+32],xmm7
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm4,eax
+ movd xmm0,edx
+ unpcklps xmm4,xmm0 ;pack two 32-bit words into xmm4
+;
+ irp _NN_,<0,1,2,3> ;copy in the chaining vars
+ mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+ xorpd xmm4,xmm&_NN_ ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm&_NN_
+ endm
+ movdqa [FP_+ksKey+16*WCNT],xmm4;save overall parity at the end of the array
+;
+ paddq xmm1,xmm5 ;inject the initial tweak words
+ paddq xmm2,xmm6
+;
+ irp _NN_,<0,1,2,3> ;perform the initial key injection
+ mov64 xmm4,ptr64 [esi+8*_NN_] ;and save a copy of the input block on stack
+ mov64 ptr64 [esp+8*_NN_+Wcopy],xmm4
+ paddq xmm&_NN_,xmm4
+ endm
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 256
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr ],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = iteration count
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT = ROUNDS_256/8 ;fully unrolled
+else
+_UNROLL_CNT = SKEIN_UNROLL_256 ;partial unroll count
+ if ((ROUNDS_256/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_256" ;sanity check
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_256_round_loop: ; (since there's no 16* scaled address mode)
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ; here with X[0..3] in XMM0..XMM3
+ R_256_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+ if _UNROLL_CNT ne (ROUNDS_256/8)
+ cmp edx,2*(ROUNDS_256/8)
+ jb Skein_256_round_loop
+ endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ irp _NN_,<0,1,2,3>
+ mov64 xmm4,ptr64 [esp+Wcopy+8*_NN_]
+ xorpd xmm&_NN_,xmm4
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+ endm
+ and ptr08 [edi +TWEAK +15],FIRST_MASK8
+if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,saveRegs
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein_256_block_loop
+
+ Reset_Stack _Skein_256_Process_Block
+ ret
+;
+_Skein_256_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+ public _Skein_256_Process_Block_CodeSize
+_Skein_256_Process_Block_CodeSize proc
+ mov eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block
+ ret
+_Skein_256_Process_Block_CodeSize endp
+;
+ public _Skein_256_Unroll_Cnt
+_Skein_256_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_256/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein_256_Unroll_Cnt endp
+endif
+endif ;_USE_ASM_ and 256
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 512
+ public _Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round
+;
+R_512_Round macro _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
+irp _nr_,<%((_RR_) and 7)>
+_Ra_ = RC_512_&_nr_&_&Ra
+_Rb_ = RC_512_&_nr_&_&Rb
+_Rc_ = RC_512_&_nr_&_&Rc
+_Rd_ = RC_512_&_nr_&_&Rd
+endm
+ paddq xmm&a0,xmm&a1
+ _stX c0
+ mov64 xmm&c0,xmm&a1
+ psllq xmm&a1, _Ra_
+ psrlq xmm&c0,64-_Ra_
+ xorpd xmm&a1,xmm&c0
+ xorpd xmm&a1,xmm&a0
+
+ paddq xmm&b0,xmm&b1
+ _stX a0
+ mov64 xmm&a0,xmm&b1
+ psllq xmm&b1, _Rb_
+ psrlq xmm&a0,64-_Rb_
+ xorpd xmm&b1,xmm&b0
+ _ldX c0
+ xorpd xmm&b1,xmm&a0
+
+ paddq xmm&c0,xmm&c1
+ mov64 xmm&a0,xmm&c1
+ psllq xmm&c1, _Rc_
+ psrlq xmm&a0,64-_Rc_
+ xorpd xmm&c1,xmm&c0
+ xorpd xmm&c1,xmm&a0
+
+ paddq xmm&d0,xmm&d1
+ mov64 xmm&a0,xmm&d1
+ psllq xmm&d1, _Rd_
+ psrlq xmm&a0,64-_Rd_
+ xorpd xmm&d1,xmm&a0
+ _ldX a0
+ xorpd xmm&d1,xmm&d0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 512,%(_RR_+1),saveRegs
+ endif
+endm
+;
+; MACRO: four rounds
+R_512_FourRounds macro _RN_
+ R_512_Round (_RN_) , 0,1,0, 2,3,1, 4,5,2, 6,7,3
+ R_512_Round (_RN_)+1, 2,1,0, 4,7,1, 6,5,2, 0,3,3
+ R_512_Round (_RN_)+2, 4,1,0, 6,3,1, 0,5,2, 2,7,3
+ R_512_Round (_RN_)+3, 6,1,0, 0,7,1, 2,5,2, 4,3,3
+
+ ;inject key schedule
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ if _UNROLL_CNT eq (ROUNDS_512/8)
+ paddq xmm&_NN_,[FP_+ksKey+16*((((_RN_)/4)+(_NN_)+1) mod 9)]
+ else
+ paddq xmm&_NN_,[SI_+ksKey+16*((_NN_)+1)]
+ endif
+ endm
+ _stX 0 ;free up a register
+ inc edx ;bump round counter
+ movd xmm0,edx ;inject the tweak
+ if _UNROLL_CNT eq (ROUNDS_512/8)
+ paddq xmm5,[FP_+ksTwk+16*(((_RN_)+1) mod 3)]
+ paddq xmm6,[FP_+ksTwk+16*(((_RN_)+2) mod 3)]
+ paddq xmm7,xmm0
+ else ;looping version
+ paddq xmm5,[SI_+ksTwk+16*1]
+ paddq xmm6,[SI_+ksTwk+16*2]
+ paddq xmm7,xmm0
+;
+ mov64 xmm0,<ptr64 [SI_+ksKey]>;first, "rotate" key schedule on the stack
+ mov64 <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm0
+ mov64 xmm0,<ptr64 [SI_+ksTwk]>; (for next time through)
+ mov64 <ptr64 [SI_+ksTwk+16*3]>,xmm0
+ add esi,16 ;bump rolling pointer
+ endif
+ _ldX 0 ;restore X0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,saveRegs
+ endif
+endm ;R_512_FourRounds
+;;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_512 equ <call _Put_XMM_512>
+Get_XMM_512 equ <call _Get_XMM_512>
+
+_Put_XMM_512:
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_
+ endm
+ ret
+;
+_Get_XMM_512:
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8]
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; code
+;
+_Skein_512_Process_Block proc near
+ WCNT = 8 ;WCNT=8 for Skein-512
+ Setup_Stack WCNT,ROUNDS_512
+ ; main hash loop for Skein_512
+Skein_512_block_loop:
+ movd xmm0,ptr32 [ebx+bitAdd]
+ mov64 xmm1,ptr64 [edi+TWEAK+0]
+ mov64 xmm2,ptr64 [edi+TWEAK+8]
+ paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [edi+TWEAK],xmm1 ;save updated tweak value T0 (for next time)
+ mov64 xmm0,xmm2
+ xorpd xmm0,xmm1 ;compute overall tweak parity
+ movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16*1],xmm2
+ movdqa [FP_+ksTwk+16*2],xmm0
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm0,eax
+ movd xmm7,edx
+ unpcklps xmm0,xmm7 ;pack two 32-bit words into xmm0
+;
+ irp _NN_,<7,6,5,4,3,2,1> ;copy in the chaining vars (skip #0 for now)
+ mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+ xorpd xmm0,xmm&_NN_ ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm&_NN_
+ if _NN_ eq 5
+ paddq xmm5,xmm1 ;inject the initial tweak words
+ paddq xmm6,xmm2 ; (before they get trashed in xmm1/2)
+ endif
+ endm
+ mov64 xmm4,ptr64 [edi+X_VARS] ;handle #0 now
+ xorpd xmm0,xmm4 ;update overall parity
+ movdqa [FP_+ksKey+16* 0 ],xmm4;save the key value in slot #0
+ movdqa [FP_+ksKey+16*WCNT],xmm0;save overall parity at the end of the array
+;
+ mov64 xmm0,xmm4
+ irp _NN_,<7,6,5, 4,3,2,1,0> ;perform the initial key injection (except #4)
+ mov64 xmm4,ptr64 [esi+ 8*_NN_];and save a copy of the input block on stack
+ mov64 ptr64 [esp+ 8*_NN_+Wcopy],xmm4
+ paddq xmm&_NN_,xmm4
+ endm
+ mov64 xmm4,ptr64 [esi+ 8*4] ;get input block word #4
+ mov64 ptr64 [esp+ 8*4+Wcopy],xmm4
+ paddq xmm4,[FP_+ksKey+16*4] ;inject the initial key
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 512
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = round counter
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT = ROUNDS_512/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ if ((ROUNDS_512/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_512"
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_512_round_loop: ; (since there's no 16* scaled address mode)
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ R_512_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+ cmp edx,2*(ROUNDS_512/8)
+ jb Skein_512_round_loop
+endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ and ptr08 [edi +TWEAK +15],FIRST_MASK8
+irp _NN_,<0,2,4,6> ;do the aligned ones first
+ xorpd xmm&_NN_,[esp+Wcopy+8*_NN_]
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+irp _NN_,<1,3,5,7> ;now we have some register space available
+ mov64 xmm0,ptr64 [esp+Wcopy+8*_NN_]
+ xorpd xmm&_NN_,xmm0
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein_512_block_loop
+
+ Reset_Stack _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+ public _Skein_512_Process_Block_CodeSize
+_Skein_512_Process_Block_CodeSize proc
+ mov eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block_CodeSize endp
+;
+ public _Skein_512_Unroll_Cnt
+_Skein_512_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_512/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 1024
+ public _Skein1024_Process_Block
+;
+; void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+R_1024_REGS equ (5) ;keep this many block variables in registers
+;
+;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_1024 equ <call _Put_XMM_1024>
+Get_XMM_1024 equ <call _Get_XMM_1024>
+
+_Put_XMM_1024:
+_NN_ = 0
+ rept R_1024_REGS
+ irp _rr_,<%(_NN_)>
+ mov64 ptr64 [esp+X_stk+4+8*_NN_],xmm&_rr_
+ endm
+_NN_ = _NN_+1
+ endm
+ ret
+;
+_Get_XMM_1024:
+_NN_ = 0
+ rept R_1024_REGS
+ irp _rr_,<%(_NN_)>
+ mov64 xmm&_rr_,ptr64 [esp+X_stk+4+8*_NN_]
+ endm
+_NN_ = _NN_+1
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one mix step
+MixStep_1024 macro x0,x1,rotIdx0,rotIdx1,_debug_
+_r0_ = x0 ;default, if already loaded
+_r1_ = x1
+ ; load the regs (if necessary)
+ if (x0 ge R_1024_REGS)
+_r0_ = 5
+ mov64 xmm5,ptr64 [esp+X_stk+8*(x0)]
+ endif
+ if (x1 ge R_1024_REGS)
+_r1_ = 6
+ mov64 xmm6,ptr64 [esp+X_stk+8*(x1)]
+ endif
+ ; do the mix
+ irp _rx_,<%((rotIdx0) and 7)>
+_Rc_ = RC_1024_&_rx_&_&rotIdx1 ;rotation constant
+ endm
+ irp _x0_,<%_r0_>
+ irp _x1_,<%_r1_>
+ paddq xmm&_x0_,xmm&_x1_
+ mov64 xmm7 ,xmm&_x1_
+ psllq xmm&_x1_, _Rc_
+ psrlq xmm7 ,64-_Rc_
+ xorpd xmm&_x1_,xmm&_x0_
+ xorpd xmm&_x1_,xmm7
+ endm
+ endm
+ ; save the regs (if necessary)
+ if (x0 ge R_1024_REGS)
+ mov64 ptr64 [esp+X_stk+8*(x0)],xmm5
+ endif
+ if (x1 ge R_1024_REGS)
+ mov64 ptr64 [esp+X_stk+8*(x1)],xmm6
+ endif
+ ; debug output
+ if _SKEIN_DEBUG and (0 ne (_debug_ + 0))
+ Skein_Debug_Round 1024,%((RotIdx0)+1),saveRegs
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds
+;
+R_1024_FourRounds macro _RR_
+ ;--------- round _RR_
+ MixStep_1024 0, 1,%((_RR_)+0),0
+ MixStep_1024 2, 3,%((_RR_)+0),1
+ MixStep_1024 4, 5,%((_RR_)+0),2
+ MixStep_1024 6, 7,%((_RR_)+0),3
+ MixStep_1024 8, 9,%((_RR_)+0),4
+ MixStep_1024 10,11,%((_RR_)+0),5
+ MixStep_1024 12,13,%((_RR_)+0),6
+ MixStep_1024 14,15,%((_RR_)+0),7,1
+ ;--------- round _RR_+1
+ MixStep_1024 0, 9,%((_RR_)+1),0
+ MixStep_1024 2,13,%((_RR_)+1),1
+ MixStep_1024 6,11,%((_RR_)+1),2
+ MixStep_1024 4,15,%((_RR_)+1),3
+ MixStep_1024 10, 7,%((_RR_)+1),4
+ MixStep_1024 12, 3,%((_RR_)+1),5
+ MixStep_1024 14, 5,%((_RR_)+1),6
+ MixStep_1024 8, 1,%((_RR_)+1),7,1
+ ;--------- round _RR_+2
+ MixStep_1024 0, 7,%((_RR_)+2),0
+ MixStep_1024 2, 5,%((_RR_)+2),1
+ MixStep_1024 4, 3,%((_RR_)+2),2
+ MixStep_1024 6, 1,%((_RR_)+2),3
+ MixStep_1024 12,15,%((_RR_)+2),4
+ MixStep_1024 14,13,%((_RR_)+2),5
+ MixStep_1024 8,11,%((_RR_)+2),6
+ MixStep_1024 10, 9,%((_RR_)+2),7,1
+ ;--------- round _RR_+3
+ MixStep_1024 0,15,%((_RR_)+3),0
+ MixStep_1024 2,11,%((_RR_)+3),1
+ MixStep_1024 6,13,%((_RR_)+3),2
+ MixStep_1024 4, 9,%((_RR_)+3),3
+ MixStep_1024 14, 1,%((_RR_)+3),4
+ MixStep_1024 8, 5,%((_RR_)+3),5
+ MixStep_1024 10, 3,%((_RR_)+3),6
+ MixStep_1024 12, 7,%((_RR_)+3),7,1
+
+ inc edx ;edx = round number
+ movd xmm7,edx
+ ;inject the key
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+ if _UNROLL_CNT ne (ROUNDS_1024/8)
+ if _NN_ lt R_1024_REGS
+ paddq xmm&_NN_,ptr64 [SI_+ksKey+16*_NN_+16]
+ else
+ mov64 xmm6 ,ptr64 [esp+X_stk+ 8*_NN_]
+ if _NN_ eq 15
+ paddq xmm6,xmm7
+ elseif _NN_ eq 14
+ paddq xmm6,ptr64 [SI_+ksTwk+16*2]
+ elseif _NN_ eq 13
+ paddq xmm6,ptr64 [SI_+ksTwk+16*1]
+ endif
+ paddq xmm6 ,ptr64 [SI_+ksKey+16*_NN_+16]
+ mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6
+ endif
+ else
+ if _NN_ lt R_1024_REGS
+ paddq xmm&_NN_,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+ else
+ mov64 xmm6,ptr64 [esp+X_stk+ 8*_NN_]
+ paddq xmm6,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+ if _NN_ eq 15
+ paddq xmm6,xmm7
+ elseif _NN_ eq 14
+ paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+2) mod 3)]
+ elseif _NN_ eq 13
+ paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+1) mod 3)]
+ endif
+ mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6
+ endif
+ endif
+endm
+if _UNROLL_CNT ne (ROUNDS_1024/8) ;rotate the key schedule on the stack
+ mov64 xmm6,ptr64 [SI_+ksKey]
+ mov64 xmm7,ptr64 [SI_+ksTwk]
+ mov64 ptr64 [SI_+ksKey+16*(WCNT+1)],xmm6
+ mov64 ptr64 [SI_+ksTwk+16* 3 ],xmm7
+ add esi,16 ;bump rolling pointer
+endif
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs
+endif
+endm ;R_1024_FourRounds
+;;;;;;;;;;;;;;;;
+; code
+;
+_Skein1024_Process_Block proc near
+;
+ WCNT = 16 ;WCNT=16 for Skein-1024
+ Setup_Stack WCNT,ROUNDS_1024
+ add edi,80h ;bias the edi ctxt offsets to keep them all short
+ctx equ <edi-80h> ;offset alias
+ ; main hash loop for Skein1024
+Skein1024_block_loop:
+ movd xmm0,ptr32 [ebx+bitAdd]
+ mov64 xmm1,ptr64 [ctx+TWEAK+0]
+ mov64 xmm2,ptr64 [ctx+TWEAK+8]
+ paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [ctx+TWEAK],xmm1 ;save updated tweak value T0 (for next time)
+ mov64 xmm0,xmm2
+ xorpd xmm0,xmm1 ;compute overall tweak parity
+ movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16],xmm2
+ movdqa [FP_+ksTwk+32],xmm0
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm7,eax
+ movd xmm6,edx
+ unpcklps xmm7,xmm6 ;pack two 32-bit words into xmm7
+;
+ lea eax,[esp+80h] ;use short offsets for Wcopy, X_stk writes below
+SP_ equ <eax-80h> ;[eax+OFFS] mode is one byte shorter than [esp+OFFS]
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+ mov64 xmm6,ptr64 [ctx+X_VARS+8*_NN_]
+ xorpd xmm7,xmm6 ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm6;save the key schedule on the stack
+ if _NN_ lt R_1024_REGS
+ _rr_ = _NN_
+ else
+ _rr_ = R_1024_REGS
+ endif
+ irp _rn_,<%(_rr_)>
+ mov64 xmm&_rn_,ptr64 [esi+ 8*_NN_];save copy of the input block on stack
+ mov64 ptr64 [SP_+ Wcopy + 8*_NN_],xmm&_rn_ ;(for feedforward later)
+ paddq xmm&_rn_,xmm6 ;inject the key into the block
+ if _NN_ eq 13
+ paddq xmm&_rn_,xmm1 ;inject the initial tweak words
+ elseif _NN_ eq 14
+ paddq xmm&_rn_,xmm2
+ endif
+ if _NN_ ge R_1024_REGS ;only save X[5..15] on stack, leave X[0..4] in regs
+ mov64 ptr64 [SP_+X_stk+8*_NN_],xmm&_rn_
+ endif
+ endm
+endm
+ movdqa [FP_+ksKey+16*WCNT],xmm7;save overall key parity at the end of the array
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 1024
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = round counter
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT = ROUNDS_1024/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_1024"
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_1024_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ R_1024_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+ cmp edx,2*(ROUNDS_1024/8)
+ jb Skein_1024_round_loop
+endif
+ and ptr08 [ctx +TWEAK +15],FIRST_MASK8 ;clear tweak bit for next time thru
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ lea eax,[esp+80h] ;allow short offsets to X_stk and Wcopy
+irp _NN_,<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+ if _NN_ lt R_1024_REGS
+ if _NN_ and 1 ;already in regs: no load needed
+ mov64 xmm7 ,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned
+ xorpd xmm&_NN_,xmm7
+ else
+ xorpd xmm&_NN_, [SP_+ Wcopy + 8*_NN_] ;aligned
+ endif
+ mov64 ptr64 [ctx+ X_vars+ 8*_NN_],xmm&_NN_
+ else
+ mov64 xmm7,ptr64 [SP_+ X_stk + 8*_NN_] ;load X value from stack
+ if _NN_ and 1
+ mov64 xmm6,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned
+ xorpd xmm7,xmm6
+ else
+ xorpd xmm7, [SP_+ Wcopy + 8*_NN_] ;aligned
+ endif
+ mov64 ptr64 [ctx+ X_vars+ 8*_NN_],xmm7
+ endif
+endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD ;no need to save regs on stack here
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein1024_block_loop
+
+ Reset_Stack _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE
+ public _Skein1024_Process_Block_CodeSize
+_Skein1024_Process_Block_CodeSize proc
+ mov eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block_CodeSize endp
+;
+ public _Skein1024_Unroll_Cnt
+_Skein1024_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_1024/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;----------------------------------------------------------------
+ end