aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Additional_Implementations/Atmel_AVR.c77
-rw-r--r--Additional_Implementations/skein_8bit_estimates.xlsbin0 -> 26112 bytes
-rw-r--r--Additional_Implementations/skein_MSC_v9_perf.txt129
-rw-r--r--Additional_Implementations/skein_block_x64.asm1335
-rw-r--r--Additional_Implementations/skein_block_x64.s1328
-rw-r--r--Additional_Implementations/skein_block_x86.asm1180
-rw-r--r--Additional_Implementations/skein_block_xmm32.asm1167
-rw-r--r--Additional_Implementations/skein_block_xmm32.s1110
-rw-r--r--Additional_Implementations/skein_perf_core2.txt1440
-rw-r--r--Additional_Implementations/skein_rot_search2.c2538
-rw-r--r--Additional_Implementations/skein_test.c1380
-rw-r--r--Optimized_32bit/SHA3api_ref.c115
-rw-r--r--Optimized_32bit/SHA3api_ref.h66
-rw-r--r--Optimized_32bit/brg_endian.h148
-rw-r--r--Optimized_32bit/brg_types.h188
-rw-r--r--Optimized_32bit/skein.c753
-rw-r--r--Optimized_32bit/skein.h327
-rw-r--r--Optimized_32bit/skein_block.c689
-rw-r--r--Optimized_32bit/skein_debug.c247
-rw-r--r--Optimized_32bit/skein_debug.h48
-rw-r--r--Optimized_32bit/skein_iv.h199
-rw-r--r--Optimized_32bit/skein_port.h124
-rw-r--r--Optimized_64bit/SHA3api_ref.c115
-rw-r--r--Optimized_64bit/SHA3api_ref.h66
-rw-r--r--Optimized_64bit/brg_endian.h148
-rw-r--r--Optimized_64bit/brg_types.h188
-rw-r--r--Optimized_64bit/skein.c753
-rw-r--r--Optimized_64bit/skein.h327
-rw-r--r--Optimized_64bit/skein_block.c689
-rw-r--r--Optimized_64bit/skein_debug.c247
-rw-r--r--Optimized_64bit/skein_debug.h48
-rw-r--r--Optimized_64bit/skein_iv.h199
-rw-r--r--Optimized_64bit/skein_port.h124
-rw-r--r--README/readme.txt166
-rw-r--r--Reference_Implementation/SHA3api_ref.c115
-rw-r--r--Reference_Implementation/SHA3api_ref.h66
-rw-r--r--Reference_Implementation/brg_endian.h148
-rw-r--r--Reference_Implementation/brg_types.h188
-rw-r--r--Reference_Implementation/skein.c747
-rw-r--r--Reference_Implementation/skein.h327
-rw-r--r--Reference_Implementation/skein_block.c369
-rw-r--r--Reference_Implementation/skein_debug.c247
-rw-r--r--Reference_Implementation/skein_debug.h48
-rw-r--r--Reference_Implementation/skein_port.h44
-rw-r--r--Supporting_Documentation/Skein Cover Sheet.pdfbin0 -> 44070 bytes
-rw-r--r--Supporting_Documentation/Skein_Implementation_Statement.pdfbin0 -> 43116 bytes
-rw-r--r--Supporting_Documentation/Skein_Submitter_Statement.pdfbin0 -> 22471 bytes
-rw-r--r--Supporting_Documentation/skein1.3.pdfbin0 -> 479368 bytes
-rw-r--r--Supporting_Documentation/skeinround3Mods.pdfbin0 -> 33906 bytes
-rw-r--r--Supporting_Documentation/tex/key_recover.pdfbin0 -> 5877 bytes
-rw-r--r--Supporting_Documentation/tex/reverserounds256.pdfbin0 -> 7745 bytes
-rw-r--r--Supporting_Documentation/tex/skein-21.mps161
-rw-r--r--Supporting_Documentation/tex/skein-22.mps832
-rw-r--r--Supporting_Documentation/tex/skein-23.mps327
-rw-r--r--Supporting_Documentation/tex/skein-24.mps398
-rw-r--r--Supporting_Documentation/tex/skein-25.mps1440
-rw-r--r--Supporting_Documentation/tex/skein-31.mps161
-rw-r--r--Supporting_Documentation/tex/skein-32.mps812
-rw-r--r--Supporting_Documentation/tex/skein-33.mps1384
-rw-r--r--Supporting_Documentation/tex/skein-41.mps349
-rw-r--r--Supporting_Documentation/tex/skein-42.mps163
-rw-r--r--Supporting_Documentation/tex/skein-51.mps200
-rw-r--r--Supporting_Documentation/tex/skein-52.mps334
-rw-r--r--Supporting_Documentation/tex/skein-53.mps259
-rw-r--r--Supporting_Documentation/tex/skein-61.mps247
-rw-r--r--Supporting_Documentation/tex/skein-71.mps90
-rw-r--r--Supporting_Documentation/tex/skein-81.mps279
-rw-r--r--Supporting_Documentation/tex/skein1.3.tex4025
-rw-r--r--Supporting_Documentation/tex/skeinround3Mods.tex76
69 files changed, 31494 insertions, 0 deletions
diff --git a/Additional_Implementations/Atmel_AVR.c b/Additional_Implementations/Atmel_AVR.c
new file mode 100644
index 000000000000..11cfdd8d74f8
--- /dev/null
+++ b/Additional_Implementations/Atmel_AVR.c
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include "skein.h"
+
+#define SKEIN_CODE_SIZE (1) /* instantiate code size routines */
+#define SKEIN_LOOP (111) /* unroll only 8 rounds */
+#define SKEIN_USE_ASM (512+1024) /* what to exclude here */
+#include "skein.c"
+#include "skein_block.c"
+
+/* For code size limitations, provide empty "dummy" versions of the unused block functions: one stub per variant whose bit is set in SKEIN_USE_ASM. */
+#if SKEIN_USE_ASM & 256
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif /* SKEIN_USE_ASM & 256 */
+#if SKEIN_USE_ASM & 512
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif /* SKEIN_USE_ASM & 512 */
+#if SKEIN_USE_ASM & 1024
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif /* SKEIN_USE_ASM & 1024 */
+
+const u08b_t msg[1] =
+ {
+ 0
+ };
+/* Size probe: hashes the one-byte msg[] with each Skein variant not masked by SKEIN_USE_ASM, then prints the API/block code sizes, the unroll count and the first five digest bytes. */
+int main(int argc,char *argv[])
+ {
+ u08b_t hash[1024/8]; /* 128 bytes: room for the largest (1024-bit) result */
+ u08b_t i,x; /* i = print index, x = running XOR of the printed bytes */
+ static size_t aBytes,bBytes,uCount; /* static, so they read as 0 if no variant is compiled in */
+ /* NOTE: every section below declares "ctx", so at most one variant may be enabled at a time. */
+#if !(SKEIN_USE_ASM & 256)
+ Skein_256_Ctxt_t ctx;
+ /* the *_CodeSize() results are doubled before printing (presumably word -> byte units; confirm) */
+ aBytes = 2*Skein_256_API_CodeSize();
+ bBytes = 2*Skein_256_Process_Block_CodeSize();
+ uCount = Skein_256_Unroll_Cnt();
+
+ Skein_256_Init (&ctx,256);
+ Skein_256_Update(&ctx,msg,sizeof(msg));
+ Skein_256_Final (&ctx,hash);
+ /* the block function reads a whole block, so feed it hash[] (block-sized) rather than the 1-byte msg[] */
+ Skein_256_Process_Block(&ctx,hash,1,256);
+#endif
+
+#if !(SKEIN_USE_ASM & 512)
+ Skein_512_Ctxt_t ctx;
+
+ aBytes = 2*Skein_512_API_CodeSize();
+ bBytes = 2*Skein_512_Process_Block_CodeSize();
+ uCount = Skein_512_Unroll_Cnt();
+
+ Skein_512_Init (&ctx,512);
+ Skein_512_Update(&ctx,msg,sizeof(msg));
+ Skein_512_Final (&ctx,hash);
+ /* as above: a full 64-byte block is read, so pass hash[] instead of the 1-byte msg[] */
+ Skein_512_Process_Block(&ctx,hash,1,512);
+#endif
+
+#if !(SKEIN_USE_ASM & 1024)
+ Skein1024_Ctxt_t ctx;
+
+ aBytes = 2*Skein1024_API_CodeSize();
+ bBytes = 2*Skein1024_Process_Block_CodeSize();
+ uCount = Skein1024_Unroll_Cnt();
+
+ Skein1024_Init (&ctx,1024);
+ Skein1024_Update(&ctx,msg,sizeof(msg));
+ Skein1024_Final (&ctx,hash);
+ /* as above: a full 128-byte block is read, so pass hash[] instead of the 1-byte msg[] */
+ Skein1024_Process_Block(&ctx,hash,1,1024);
+#endif
+ printf("API size = %4lu bytes. Block size = %4lu bytes. Unroll=%lu\n",
+ (unsigned long)aBytes,(unsigned long)bBytes,(unsigned long)uCount); /* size_t does not match %d, so print via unsigned long */
+ for (i=x=0;i<5;i++)
+ printf("hash[%d] = %02X [%02X]\n",i,hash[i],x ^= hash[i]);
+ }
diff --git a/Additional_Implementations/skein_8bit_estimates.xls b/Additional_Implementations/skein_8bit_estimates.xls
new file mode 100644
index 000000000000..ecc66a28f205
--- /dev/null
+++ b/Additional_Implementations/skein_8bit_estimates.xls
Binary files differ
diff --git a/Additional_Implementations/skein_MSC_v9_perf.txt b/Additional_Implementations/skein_MSC_v9_perf.txt
new file mode 100644
index 000000000000..9e8f125a45c5
--- /dev/null
+++ b/Additional_Implementations/skein_MSC_v9_perf.txt
@@ -0,0 +1,129 @@
+File STDIN:
+ 1_ || 2802.00 2814.00 | 5952.00 5952.00 | 30606.00 30606.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10_ || 278.40 278.40 | 593.40 593.40 | 3063.00 3063.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100_ || 65.52 65.58 | 88.02 88.08 | 306.30 306.30 | //: 32-bit, MSC_v9.00 [ C =...]
+ 1000_ || 41.26 41.41 | 47.96 47.96 | 135.28 135.29 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10000_ || 38.86 39.08 | 44.13 44.21 | 119.88 120.11 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ || 38.85 39.09 | 43.56 43.77 | 105.79 114.18 | //: 32-bit, MSC_v9.00 [ C =...]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+ Block || 10192 bytes | 22960 bytes | 53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+ 1_ || 780.00 786.00 | 1110.00 1110.00 | 3288.00 3318.00 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10_ || 78.60 79.80 | 109.80 109.80 | 331.20 331.80 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100_ || 16.74 16.80 | 15.54 15.54 | 33.30 33.30 | //: 64-bit, MSC_v9.00 [ C =...]
+ 1000_ || 9.88 10.67 | 7.38 7.38 | 14.16 14.17 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10000_ || 9.21 9.22 | 6.60 6.60 | 12.27 12.39 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ || 9.98 10.01 | 7.04 7.08 | 12.36 13.14 | //: 64-bit, MSC_v9.00 [ C =...]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+ Block || 2272 bytes | 4944 bytes | 15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+ 1_ || 2484.00 2490.00 | 4830.00 4836.00 | 22182.00 22188.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10_ || 250.20 252.00 | 485.40 488.40 | 1936.80 1959.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100_ || 58.62 58.68 | 70.74 70.80 | 221.76 221.76 | //: 32-bit, MSC_v9.00 [asm=...]
+ 1000_ || 34.12 34.16 | 35.44 35.44 | 85.27 85.31 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10000_ || 34.78 34.98 | 35.36 35.36 | 86.31 86.35 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ || 32.96 33.40 | 33.29 33.60 | 75.79 76.81 | //: 32-bit, MSC_v9.00 [asm=...]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+ Block || 7588 bytes | 16636 bytes | 38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+ 1_ || 672.00 672.00 | 1068.00 1068.00 | 1920.00 1926.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10_ || 64.80 65.40 | 107.40 108.00 | 192.00 192.60 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100_ || 15.54 15.60 | 16.20 16.26 | 21.06 21.06 | //: 64-bit, MSC_v9.00 [asm=...]
+ 1000_ || 8.18 8.18 | 6.97 6.97 | 7.77 7.78 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10000_ || 7.59 7.59 | 6.23 6.23 | 6.69 6.69 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ || 7.55 7.71 | 6.14 6.38 | 6.56 6.86 | //: 64-bit, MSC_v9.00 [asm=...]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+ Block || 2323 bytes | 4733 bytes | 11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+ 1_ || 2952.00 2958.00 | 6030.00 6036.00 | 13668.00 13674.00 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10_ || 295.80 295.80 | 603.00 603.60 | 1366.80 1366.80 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100_ || 69.96 70.02 | 88.98 89.04 | 136.92 137.52 | //: 32-bit, MSC_v9.00 [ C =111]
+ 1000_ || 43.90 43.96 | 48.78 48.85 | 60.08 60.11 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10000_ || 41.53 41.59 | 44.76 44.80 | 53.01 53.01 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ || 41.32 41.60 | 44.52 44.62 | 51.75 51.92 | //: 32-bit, MSC_v9.00 [ C =111]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+ Block || 1712 bytes | 3664 bytes | 7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+ 1_ || 780.00 786.00 | 1422.00 1434.00 | 3810.00 3816.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10_ || 75.60 76.20 | 140.40 140.40 | 380.40 381.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100_ || 17.16 17.22 | 20.52 21.00 | 38.22 38.28 | //: 64-bit, MSC_v9.00 [ C =111]
+ 1000_ || 9.69 9.69 | 10.42 10.42 | 16.51 16.51 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10000_ || 8.97 8.97 | 9.38 9.38 | 14.38 14.40 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ || 9.18 9.71 | 9.35 9.49 | 14.79 14.99 | //: 64-bit, MSC_v9.00 [ C =111]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+ Block || 704 bytes | 1456 bytes | 2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+ 1_ || 2580.00 2598.00 | 4842.00 4848.00 | 10578.00 10602.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10_ || 259.80 259.80 | 484.20 484.20 | 1059.60 1060.20 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100_ || 57.18 57.24 | 66.42 66.48 | 98.40 98.46 | //: 32-bit, MSC_v9.00 [asm=111]
+ 1000_ || 35.56 35.59 | 35.96 35.96 | 42.79 42.80 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10000_ || 33.69 36.50 | 33.29 33.42 | 37.98 41.34 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ || 33.96 34.57 | 33.93 35.69 | 38.04 38.20 | //: 32-bit, MSC_v9.00 [asm=111]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+ Block || 1276 bytes | 2532 bytes | 4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+ 1_ || 678.00 678.00 | 1098.00 1098.00 | 2034.00 2040.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10_ || 66.60 66.60 | 109.80 109.80 | 204.00 204.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100_ || 15.48 16.68 | 16.98 16.98 | 22.38 22.38 | //: 64-bit, MSC_v9.00 [asm=111]
+ 1000_ || 8.45 8.45 | 7.93 7.93 | 8.39 8.39 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10000_ || 7.81 7.81 | 6.50 6.50 | 7.18 7.18 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ || 8.08 8.09 | 6.40 6.71 | 6.98 7.21 | //: 64-bit, MSC_v9.00 [asm=111]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+ Block || 664 bytes | 1074 bytes | 2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+ 1_ || 2988.00 2994.00 | 6240.00 6246.00 | 13794.00 13800.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10_ || 297.60 299.40 | 623.40 624.00 | 1379.40 1380.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100_ || 70.26 70.32 | 91.92 91.92 | 138.00 138.06 | //: 32-bit, MSC_v9.00 [ C =332]
+ 1000_ || 44.88 44.89 | 50.20 50.20 | 60.44 60.45 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10000_ || 42.42 42.42 | 46.30 46.31 | 53.29 53.31 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ || 42.21 42.50 | 43.60 45.77 | 49.55 50.03 | //: 32-bit, MSC_v9.00 [ C =332]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+ Block || 4560 bytes | 9232 bytes | 12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+ 1_ || 780.00 798.00 | 1920.00 1920.00 | 3732.00 3732.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10_ || 76.80 76.80 | 189.00 191.40 | 402.60 402.60 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100_ || 17.10 17.16 | 27.66 27.90 | 37.62 37.62 | //: 64-bit, MSC_v9.00 [ C =332]
+ 1000_ || 9.98 10.12 | 14.23 14.25 | 16.13 16.13 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10000_ || 9.27 9.28 | 12.89 12.99 | 13.98 13.98 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ || 9.32 9.56 | 13.12 13.19 | 14.15 14.23 | //: 64-bit, MSC_v9.00 [ C =332]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+ Block || 1200 bytes | 2928 bytes | 5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+ 1_ || 2598.00 2604.00 | 4866.00 4878.00 | 10614.00 10632.00 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10_ || 260.40 261.00 | 490.20 490.20 | 1067.40 1067.40 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100_ || 60.78 60.78 | 72.00 72.00 | 106.86 106.92 | //: 32-bit, MSC_v9.00 [asm=332]
+ 1000_ || 38.38 38.42 | 39.17 39.19 | 46.49 46.61 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10000_ || 40.98 47.69 | 35.81 35.86 | 40.96 43.93 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ || 34.46 36.34 | 34.07 37.16 | 39.60 43.18 | //: 32-bit, MSC_v9.00 [asm=332]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+ Block || 3060 bytes | 6300 bytes | 8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2028.00 2034.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10_ || 70.80 70.80 | 120.00 120.00 | 219.00 219.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100_ || 15.72 15.72 | 16.74 16.74 | 22.20 22.20 | //: 64-bit, MSC_v9.00 [asm=332]
+ 1000_ || 8.42 8.42 | 7.22 7.22 | 8.30 8.30 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10000_ || 7.85 8.51 | 6.58 6.58 | 7.11 7.12 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ || 7.80 9.43 | 6.90 7.71 | 7.18 8.48 | //: 64-bit, MSC_v9.00 [asm=332]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+ Block || 1288 bytes | 2182 bytes | 3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+ 1_ || 2994.00 2994.00 | 6240.00 6240.00 | 14598.00 14604.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10_ || 300.60 301.20 | 624.00 624.60 | 1459.20 1461.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100_ || 70.62 70.68 | 91.86 91.92 | 146.10 146.16 | //: 32-bit, MSC_v9.00 [ C =335]
+ 1000_ || 44.65 44.65 | 50.20 50.20 | 62.74 62.76 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10000_ || 42.16 42.42 | 46.31 46.73 | 55.11 55.13 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ || 40.09 40.55 | 45.76 45.97 | 51.00 53.08 | //: 32-bit, MSC_v9.00 [ C =335]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+ Block || 4560 bytes | 9232 bytes | 29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+ 1_ || 780.00 798.00 | 1890.00 1920.00 | 3498.00 3498.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10_ || 77.40 78.00 | 190.80 195.00 | 350.40 379.20 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100_ || 17.10 17.10 | 27.72 28.08 | 35.28 35.28 | //: 64-bit, MSC_v9.00 [ C =335]
+ 1000_ || 9.95 10.00 | 14.23 14.24 | 15.09 15.10 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10000_ || 9.30 10.06 | 12.94 14.10 | 13.07 14.36 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ || 9.33 9.58 | 13.94 13.95 | 13.24 13.92 | //: 64-bit, MSC_v9.00 [ C =335]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+ Block || 1200 bytes | 2928 bytes | 10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+ 1_ || 2586.00 2592.00 | 4896.00 4902.00 | 10668.00 10668.00 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10_ || 263.40 263.40 | 489.60 489.60 | 1069.20 1069.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100_ || 61.08 61.14 | 72.30 72.36 | 107.04 107.10 | //: 32-bit, MSC_v9.00 [asm=335]
+ 1000_ || 35.57 35.57 | 36.11 36.12 | 43.07 43.12 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10000_ || 33.68 34.51 | 33.29 36.32 | 37.91 39.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ || 36.32 36.43 | 35.91 35.98 | 38.02 38.19 | //: 32-bit, MSC_v9.00 [asm=335]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+ Block || 3060 bytes | 6300 bytes | 20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2022.00 2022.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10_ || 65.40 65.40 | 109.80 109.80 | 201.60 202.20 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100_ || 15.78 15.78 | 16.80 16.80 | 22.02 22.08 | //: 64-bit, MSC_v9.00 [asm=335]
+ 1000_ || 8.41 8.42 | 7.21 7.22 | 8.24 8.26 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10000_ || 7.84 7.84 | 6.45 6.50 | 7.12 7.12 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ || 8.11 8.11 | 6.49 6.74 | 6.95 7.26 | //: 64-bit, MSC_v9.00 [asm=335]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+ Block || 1288 bytes | 2182 bytes | 7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]
diff --git a/Additional_Implementations/skein_block_x64.asm b/Additional_Implementations/skein_block_x64.asm
new file mode 100644
index 000000000000..b5221ae423ad
--- /dev/null
+++ b/Additional_Implementations/skein_block_x64.asm
@@ -0,0 +1,1335 @@
+;
+;----------------------------------------------------------------
+; 64-bit x86 assembler code (Microsoft ML64) for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+ .code
+;
+_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits
+_MAX_FRAME_ equ 240 ;upper limit applied to FRAME_OFFS in Setup_Stack (for .setframe)
+;
+;;;;;;;;;;;;;;;;; _USE_ASM_ = bit mask of the block sizes to assemble
+ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_ ;nothing requested: build all three
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_ = SKEIN_USE_ASM ;at least one valid bit: use the request as given
+else
+_USE_ASM_ = _MASK_ALL_ ;no valid bit set: fall back to all three
+endif
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP ;configure loop unrolling
+_SKEIN_LOOP = 0 ;default is all fully unrolled
+else
+_SKEIN_LOOP = SKEIN_LOOP ;three decimal digits: hundreds=256, tens=512, ones=1024
+endif
+; the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10
+; SKEIN_ASM_UNROLL = bit mask of the block sizes that are fully unrolled (unroll count 0)
+SKEIN_ASM_UNROLL = 0
+ irp _NN_,<256,512,1024>
+ if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_
+ endif
+ endm
+;;;;;;;;;;;;;;;;;
+; round counts: defaults, or one decimal digit d of SKEIN_ROUNDS per size giving 8*(((d+5) mod 10)+5), e.g. 9 -> 72, 0 -> 80
+ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+else
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5)
+endif
+; report the round count of each enabled block size while assembling
+irp _NN_,<256,512,1024>
+ if _USE_ASM_ and _NN_
+ irp _RR_,<%(ROUNDS_&_NN_)>
+ if _NN_ eq 1024
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ else
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ endif
+ endm
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+; SKEIN_PERF implies SKEIN_CODE_SIZE unless the latter is already defined
+ifndef SKEIN_CODE_SIZE
+ifdef SKEIN_PERF
+SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+; _SKEIN_DEBUG = 1 when SKEIN_DEBUG is defined, else 0 (selects the real or the empty debug macros)
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+else
+_SKEIN_DEBUG = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure (byte offsets: 0, 8, 16 and 32)
+;
+HASH_BITS = 0 ;# bits of hash output
+BCNT = 8 + HASH_BITS ;number of bytes in BUFFER[]
+TWEAK = 8 + BCNT ;tweak values[0..1]
+X_VARS = 16 + TWEAK ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+r08 equ <r8> ;two-digit spellings of r8/r9 (presumably so register names line up in columns)
+r09 equ <r9>
+;
+KW_PARITY = 01BD11BDAA9FC1A22h ;overall parity of key schedule words
+FIRST_MASK = NOT (1 SHL 62) ;all bits set except bit 62
+;
+; rotation constants for Skein, named RC_<block bits>_<round index 0..7>_<mix index>
+; ---- Skein-256: two constants per round ----
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+; ---- Skein-512: four constants per round ----
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+; ---- Skein-1024: eight constants per round ----
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+; RotL64: rotate a 64-bit register left by one of the RC_ rotation constants
+; Input: reg (BLK_SIZE, ROUND_NUM and MIX_NUM pick the constant by name)
+; Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+; The count is masked to 0..63 and no instruction is emitted when it is zero.
+RotL64 macro reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )
+ if _RCNT_ ;is there anything to do?
+ rol reg,_RCNT_
+ endif
+endm
+;
+;----------------------------------------------------------------
+;
+; MACROS: define local vars and configure stack
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack: localName gets the current offset, then the offset advances by localSize bytes
+StackVar macro localName,localSize
+localName = _STK_OFFS_ ;the name is an offset, not an address
+_STK_OFFS_ = _STK_OFFS_+(localSize) ;the next variable starts right after this one
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars (procedure prologue)
+; BLK_BITS = 256/512/1024; KS_CNT sizes ksRot; a nonblank NO_FRAME omits .setframe; debugCnt sizes the debug-only xDebug area
+Setup_Stack macro BLK_BITS,KS_CNT,NO_FRAME,debugCnt
+ WCNT = (BLK_BITS)/64
+; push the nonvolatile registers, counting them for the alignment test below and for Reset_Stack
+_PushCnt_ = 0 ;save nonvolatile regs on stack
+ irp _reg_,<rbp,rsi,rdi,rbx,r12,r13,r14,r15>
+ push _reg_
+ .pushreg _reg_ ;pseudo-op push for exception handling
+_PushCnt_ = _PushCnt_ + 1 ;track count to keep alignment
+ endm
+; lay out the frame: every StackVar name becomes an offset from the final rsp
+_STK_OFFS_ = 0 ;starting offset from rsp
+ ;---- local variables ;<-- rsp
+ StackVar X_stk ,8*(WCNT) ;local context vars
+ StackVar ksTwk ,8*3 ;key schedule: tweak words
+ StackVar ksKey ,8*(WCNT)+8 ;key schedule: key words
+ if (SKEIN_ASM_UNROLL and (BLK_BITS)) eq 0
+ StackVar ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+ endif
+ StackVar Wcopy ,8*(WCNT) ;copy of input block
+ if _SKEIN_DEBUG
+ ifnb <debugCnt> ;temp location for debug X[] info
+ StackVar xDebug_&BLK_BITS ,8*(debugCnt)
+ endif
+ endif
+ if ((8*_PushCnt_ + _STK_OFFS_) and 8) eq 0 ;pushes plus locals are a multiple of 16 bytes
+ StackVar align16,8 ;keep 16-byte aligned (adjust for retAddr?)
+tmpStk_&BLK_BITS = align16 ;use this (defined only when the pad slot is added)
+ endif
+LOCAL_SIZE = _STK_OFFS_ ;size of local vars
+ ;---- the slots below are not part of LOCAL_SIZE
+ StackVar savRegs,8*_PushCnt_ ;saved registers
+ StackVar retAddr,8 ;return address
+ ;---- caller parameters
+ StackVar ctxPtr ,8 ;context ptr
+ StackVar blkPtr ,8 ;pointer to block data
+ StackVar blkCnt ,8 ;number of full blocks to process
+ StackVar bitAdd ,8 ;bit count to add to tweak
+ ;---- caller's stack frame
+;
+; set up the stack frame pointer (rbp)
+;
+FRAME_OFFS = ksTwk + 128 ;allow short (negative) offset to ksTwk, ksKey
+ if FRAME_OFFS gt _STK_OFFS_ ;keep rbp in the "locals" range
+FRAME_OFFS = _STK_OFFS_
+ endif
+ if FRAME_OFFS gt _MAX_FRAME_ ;keep Microsoft .setframe happy
+FRAME_OFFS = _MAX_FRAME_
+ endif
+; optional assembly-time diagnostics about what lies beyond a short offset from rbp
+ifdef SKEIN_ASM_INFO
+ if FRAME_OFFS+128 lt savRegs
+%out +++ SKEIN_&BLK_BITS: Unable to reach all of Wcopy with short offset from rbp.
+ elseif FRAME_OFFS+128 lt Wcopy ;NOTE(review): cannot be true when reached, since Wcopy lies below savRegs
+%out +++ SKEIN_&BLK_BITS: Unable to reach end of Wcopy with short offset from rbp.
+ elseif FRAME_OFFS+128 lt _STK_OFFS_
+%out +++ SKEIN_&BLK_BITS: Unable to reach caller parms with short offset from rbp
+ endif
+endif
+ ;put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_&BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_&BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_&BLK_BITS = FRAME_OFFS
+;
+; Notes on stack frame setup:
+; * the most frequently used variable is X_stk[], based at [rsp+0]
+; * the next most used is the key schedule arrays, ksKey and ksTwk
+; so rbp is "centered" there, allowing short offsets to the key
+; schedule even in 1024-bit Skein case
+; * the Wcopy variables are infrequently accessed, but they have long
+; offsets from both rsp and rbp only in the 1024-bit case.
+; * all other local vars and calling parameters can be accessed
+; with short offsets, except in the 1024-bit case
+;
+ sub rsp,LOCAL_SIZE ;make room for the locals
+ .allocstack LOCAL_SIZE ;pseudo op for exception handling
+ lea rbp,[rsp+FRAME_OFFS] ;maximize use of short offsets
+ ifb <NO_FRAME>
+ .setframe rbp, FRAME_OFFS ;pseudo op for exception handling
+ endif
+ mov [FP_+ctxPtr],rcx ;save caller's parameters on the stack
+ mov [FP_+blkPtr],rdx
+ mov [FP_+blkCnt],r08
+ mov [FP_+bitAdd],r09
+ .endprolog ;pseudo op to support exception handling
+
+ mov rdi,[FP_+ctxPtr ] ;rdi --> context
+;
+endm ;Setup_Stack
+; FP_ = base of the locals: Setup_Stack sets rbp to rsp+FRAME_OFFS, so FP_+name is that rsp plus the StackVar offset
+FP_ equ <rbp-FRAME_OFFS> ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+; Reset_Stack: undo Setup_Stack (drop the locals, pop the saved registers) and report the code size; emits no ret itself
+Reset_Stack macro procStart
+ add rsp,LOCAL_SIZE ;get rid of locals (wipe??)
+ irp _reg_,<r15,r14,r13,r12,rbx,rdi,rsi,rbp>
+ pop _reg_
+_PushCnt_ = _PushCnt_ - 1
+ endm
+ if _PushCnt_ ;must be back to zero after the pops
+ .err "Mismatched push/pops?"
+ endif
+
+ ;display code size in bytes to stdout
+ irp _BCNT_,<%($+1-procStart)> ;account for return opcode
+_ProcBytes_ = _BCNT_
+if _BCNT_ ge 10000
+%out procStart code size = _BCNT_ bytes
+elseif _BCNT_ ge 1000
+%out procStart code size = _BCNT_ bytes
+else
+%out procStart code size = _BCNT_ bytes
+endif
+ endm ;irp _BCNT_
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+; (real versions when _SKEIN_DEBUG is nonzero, empty macros with the same names otherwise)
+if _SKEIN_DEBUG
+ extrn Skein_Show_Block:proc ;calls to C routines
+ extrn Skein_Show_Round:proc
+;
+SKEIN_RND_SPECIAL = 1000 ;round values at or above this are passed to the debug routine as-is
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+;
+Skein_Debug_Block macro BLK_BITS
+; Calls the C routine below with seven arguments, preserving the volatile registers around the call.
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+; const u08b_t *blkPtr, const u64b_t *wPtr,
+; const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+ irp _reg_,<rax,rcx,rdx,r08,r09,r10,r11>
+ push _reg_ ;save all volatile regs on stack before the call
+ endm
+ ; get and push call parameters
+ lea rax,[FP_+ksTwk] ;tweak pointer
+ push rax
+ lea rax,[FP_+ksKey] ;key pointer
+ push rax
+ lea rax,[FP_+Wcopy] ;wPtr
+ push rax
+ mov r09,[FP_+blkPtr] ;blkPtr
+ push r09 ;(push register parameters anyway to make room on stack)
+ mov rdx,[FP_+ctxPtr]
+ lea r08,[rdx+X_VARS] ;X (pointer)
+ push r08
+ push rdx ;h (pointer)
+ mov rcx, BLK_BITS ;bits
+ push rdx ;slot for the first (rcx) argument; NOTE(review): pushes rdx, presumably only the space matters
+ call Skein_Show_Block ;call external debug handler
+ add rsp,7*8 ;discard parameters on stack
+ irp _reg_,<r11,r10,r09,r08,rdx,rcx,rax>
+ pop _reg_ ;restore regs
+ endm
+endm ; Skein_Debug_Block
+;
+; Skein_Debug_Round: pass a round number in r08 to Skein_Debug_Round_<BLK_BITS>, then emit afterOp
+; the macro to "call" to debug a round
+; (R is used literally when the size is fully unrolled or R is a SKEIN_RND_ special value)
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+ ; call the appropriate (local) debug function
+ push r08
+ if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+ mov r08, R
+ else ;compute round number using edi
+_rOffs_ = RDI_OFFS + 0 ;the "+ 0" lets an omitted RDI_OFFS evaluate to 0
+ if BLK_BITS eq 1024
+ mov r08,[rsp+8+rIdx_offs] ;get rIdx off the stack (adjust for push r08)
+ lea r08,[4*r08+1+(((R)-1) and 3)+_rOffs_]
+ else
+ lea r08,[4*rdi+1+(((R)-1) and 3)+_rOffs_]
+ endif
+ endif
+ call Skein_Debug_Round_&BLK_BITS
+ pop r08
+;
+ afterOp
+endm ; Skein_Debug_Round
+else ;------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+Skein_Debug_Block macro BLK_BITS,afterOp
+endm
+; empty version: expands to nothing, so afterOp is not emitted either
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+endm
+;
+endif ; _SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; addReg: dstReg += source register (+ immOffs); the source name is srcReg_A and srcReg_B pasted together. Uses lea unless add is requested.
+addReg macro dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+ ifnb <immOffs>
+ lea dstReg,[srcReg_A&&srcReg_B + dstReg + immOffs]
+ elseif ((useAddOp + 0) eq 0)
+ ifndef ASM_NO_LEA
+ ;lea seems to be faster on Core 2 Duo CPUs!
+ lea dstReg,[srcReg_A&&srcReg_B + dstReg]
+ else
+ add dstReg, srcReg_A&&srcReg_B
+ endif
+ else
+ add dstReg, srcReg_A&&srcReg_B
+ endif
+endm
+;
+;=================================== Skein_256 =============================================
+;
+if _USE_ASM_ and 256
+ public Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+Skein_256_Process_Block proc frame
+ Setup_Stack 256,((ROUNDS_256/8)+1)
+ mov r14,[rdi+TWEAK+8]
+ jmp short Skein_256_block_loop
+ align 16
+ ; main hash loop for Skein_256
+Skein_256_block_loop:
+ ;
+ ; general register usage:
+ ; RAX..RDX = X0..X3
+ ; R08..R12 = ks[0..4]
+ ; R13..R15 = ts[0..2]
+ ; RSP, RBP = stack/frame pointers
+ ; RDI = round counter or context pointer
+ ; RSI = temp
+ ;
+ mov r13,[rdi+TWEAK+0]
+ add r13,[FP_+bitAdd] ;computed updated tweak value T0
+ mov r15,r14
+ xor r15,r13 ;now r13.r15 is set as the tweak
+
+ mov r12,KW_PARITY
+ mov r08,[rdi+X_VARS+ 0]
+ mov r09,[rdi+X_VARS+ 8]
+ mov r10,[rdi+X_VARS+16]
+ mov r11,[rdi+X_VARS+24]
+ mov [rdi+TWEAK+0],r13 ;save updated tweak value ctx->h.T[0]
+ xor r12,r08 ;start accumulating overall parity
+
+ mov rsi,[FP_+blkPtr ] ;esi --> input block
+ xor r12,r09
+ mov rax,[rsi+ 0] ;get X[0..3]
+ xor r12,r10
+ mov rbx,[rsi+ 8]
+ xor r12,r11
+ mov rcx,[rsi+16]
+ mov rdx,[rsi+24]
+
+ mov [FP_+Wcopy+ 0],rax ;save copy of input block
+ mov [FP_+Wcopy+ 8],rbx
+ mov [FP_+Wcopy+16],rcx
+ mov [FP_+Wcopy+24],rdx
+
+ add rax, r08 ;initial key injection
+ add rbx, r09
+ add rcx, r10
+ add rdx, r11
+ add rbx, r13
+ add rcx, r14
+
+if _SKEIN_DEBUG
+ mov [rdi+TWEAK+ 8],r14 ;save updated tweak T[1] (start bit cleared?)
+ mov [FP_+ksKey+ 0],r08 ;save key schedule on stack for Skein_Debug_Block
+ mov [FP_+ksKey+ 8],r09
+ mov [FP_+ksKey+16],r10
+ mov [FP_+ksKey+24],r11
+ mov [FP_+ksKey+32],r12
+
+ mov [FP_+ksTwk+ 0],r13
+ mov [FP_+ksTwk+ 8],r14
+ mov [FP_+ksTwk+16],r15
+
+ mov [rsp+X_stk + 0],rax ;save X[] on stack for Skein_Debug_Block
+ mov [rsp+X_stk + 8],rbx
+ mov [rsp+X_stk +16],rcx
+ mov [rsp+X_stk +24],rdx
+
+ Skein_Debug_Block 256 ;debug dump
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+endif
+;
+if ((SKEIN_ASM_UNROLL and 256) eq 0)
+ mov [FP_+ksKey+40],r08 ;save key schedule on stack for looping code
+ mov [FP_+ksKey+ 8],r09
+ mov [FP_+ksKey+16],r10
+ mov [FP_+ksKey+24],r11
+ mov [FP_+ksKey+32],r12
+
+ mov [FP_+ksTwk+24],r13
+ mov [FP_+ksTwk+ 8],r14
+ mov [FP_+ksTwk+16],r15
+endif
+ add rsi, WCNT*8 ;skip the block
+ mov [FP_+blkPtr ],rsi ;update block pointer
+;
+opLoop macro op1,op2
+ if (SKEIN_ASM_UNROLL and 256) eq 0
+ op1
+ else
+ op2
+ endif
+endm
+;
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT = ROUNDS_256/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_256
+ if ((ROUNDS_256/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_256"
+ endif
+ xor rdi,rdi ;rdi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ ; all X and ks vars in regs ; (ops to "rotate" ks vars, via mem, if not unrolled)
+ ; round 4*_RBase_ + 0
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_RBase_+0) and 7),0
+ addReg rcx, rdx
+ opLoop <mov r08,[FP_+ksKey+8*rdi+8*1]>
+ xor rbx, rax
+ RotL64 rdx, 256,%((4*_RBase_+0) and 7),1
+ xor rdx, rcx
+ if SKEIN_ASM_UNROLL and 256
+ irp _r0_,<%(08+(_Rbase_+3) mod 5)>
+ irp _r1_,<%(13+(_Rbase_+2) mod 3)>
+ lea rdi,[r&_r0_+r&_r1_] ;precompute key injection value for rcx
+ endm
+ endm
+ endif
+ opLoop <mov r13,[FP_+ksTwk+8*rdi+8*1]>
+ Skein_Debug_Round 256,%(4*_RBase_+1)
+
+ ; round 4*_RBase_ + 1
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_RBase_+1) and 7),0
+ xor rdx, rax
+ opLoop <mov r09,[FP_+ksKey+8*rdi+8*2]>
+ addReg rcx, rbx
+ RotL64 rbx, 256,%((4*_RBase_+1) and 7),1
+ xor rbx, rcx
+ opLoop <mov r11,[FP_+ksKey+8*rdi+8*4]>
+ Skein_Debug_Round 256,%(4*_RBase_+2)
+ if SKEIN_ASM_UNROLL and 256
+ irp _r0_,<%(08+(_Rbase_+2) mod 5)>
+ irp _r1_,<%(13+(_Rbase_+1) mod 3)>
+ lea rsi,[r&_r0_+r&_r1_] ;precompute key injection value for rbx
+ endm
+ endm
+ endif
+ ; round 4*_RBase_ + 2
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_RBase_+2) and 7),0
+ addReg rcx, rdx
+ opLoop <mov r10,[FP_+ksKey+8*rdi+8*3]>
+ xor rbx, rax
+ RotL64 rdx, 256,%((4*_RBase_+2) and 7),1
+ xor rdx, rcx
+ opLoop <mov [FP_+ksKey+8*rdi+8*6],r08> ;"rotate" the key
+ opLoop <lea r11,[r11+rdi+1]> ;precompute key + tweak
+ Skein_Debug_Round 256,%(4*_RBase_+3)
+ ; round 4*_RBase_ + 3
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_RBase_+3) and 7),0
+ addReg rcx, rbx
+ opLoop <add r10,[FP_+ksTwk+8*rdi+8*2]> ;precompute key + tweak
+ opLoop <mov [FP_+ksTwk+8*rdi+8*4],r13> ;"rotate" the tweak
+ xor rdx, rax
+ RotL64 rbx, 256,%((4*_RBase_+3) and 7),1
+ xor rbx, rcx
+ Skein_Debug_Round 256,%(4*_RBase_+4)
+ opLoop <addReg r09,r13> ;precompute key+tweak
+ ;inject key schedule words
+_Rbase_ = _Rbase_+1
+ if SKEIN_ASM_UNROLL and 256
+ addReg rax,r,%(08+((_Rbase_+0) mod 5))
+ addReg rbx,rsi
+ addReg rcx,rdi
+ addReg rdx,r,%(08+((_Rbase_+3) mod 5)),,_Rbase_
+ else
+ inc rdi
+ addReg rax,r08
+ addReg rcx,r10
+ addReg rbx,r09
+ addReg rdx,r11
+ endif
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+endm ;rept _UNROLL_CNT
+
+;
+if (SKEIN_ASM_UNROLL and 256) eq 0
+ cmp rdi,2*(ROUNDS_256/8)
+ jb Skein_256_round_loop
+endif ; (SKEIN_ASM_UNROLL and 256) eq 0
+ mov rdi,[FP_+ctxPtr ] ;restore edi --> context
+
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ xor rax,[FP_+Wcopy + 0]
+ mov r14,FIRST_MASK
+ xor rbx,[FP_+Wcopy + 8]
+ xor rcx,[FP_+Wcopy +16]
+ xor rdx,[FP_+Wcopy +24]
+ mov [rdi+X_VARS+ 0],rax ;store final result
+ and r14,[rdi+TWEAK + 8]
+ dec qword ptr [FP_+blkCnt] ;set zero flag
+ mov [rdi+X_VARS+ 8],rbx
+ mov [rdi+X_VARS+16],rcx
+ mov [rdi+X_VARS+24],rdx
+
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,,<cmp qword ptr [FP_+blkCnt],0>
+
+ ; go back for more blocks, if needed
+ jnz Skein_256_block_loop
+ mov [rdi+TWEAK + 8],r14
+ Reset_Stack Skein_256_Process_Block
+ ret
+
+ if _SKEIN_DEBUG
+Skein_Debug_Round_256:
+ mov [FP_+X_stk+ 0],rax ;first, save X[] state on stack so debug routines can access it
+ mov [FP_+X_stk+ 8],rbx ;(use FP_ since rsp has changed!)
+ mov [FP_+X_stk+16],rcx
+ mov [FP_+X_stk+24],rdx
+ push rdx ;save two regs for BLK_BITS-specific parms
+ push rcx
+ mov rdx,[FP_+ctxPtr] ;ctx_hdr_ptr
+ mov rcx, 256
+ jmp Skein_Debug_Round_Common
+ endif
+
+Skein_256_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE
+ public Skein_256_Process_Block_CodeSize
+Skein_256_Process_Block_CodeSize proc ;returns the assemble-time value _ProcBytes_ in rax
+ mov rax,_ProcBytes_ ;_ProcBytes_ is set outside this excerpt (presumably the byte size of the block function -- confirm)
+ ret
+Skein_256_Process_Block_CodeSize endp
+;
+ public Skein_256_Unroll_Cnt
+Skein_256_Unroll_Cnt proc ;returns the unroll count in rax; 0 means fully unrolled
+ if _UNROLL_CNT ne ROUNDS_256/8 ;looping build: report the configured unroll count
+ mov rax,_UNROLL_CNT
+ else
+ xor rax,rax ;fully unrolled build: report 0
+ endif
+ ret
+Skein_256_Unroll_Cnt endp
+endif
+;
+endif ;_USE_ASM_ and 256
+;
+;=================================== Skein_512 =============================================
+;
+if _USE_ASM_ and 512
+ public Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+rX_512_0 equ r08 ;register assignments for X[] values during rounds: X[0]
+rX_512_1 equ r09 ;X[1]
+rX_512_2 equ r10 ;X[2]
+rX_512_3 equ r11 ;X[3]
+rX_512_4 equ r12 ;X[4]
+rX_512_5 equ r13 ;X[5]
+rX_512_6 equ r14 ;X[6]
+rX_512_7 equ r15 ;X[7]
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 512-bit blocks
+;
+R_512_OneRound macro r0,r1,r2,r3,r4,r5,r6,r7,_Rn_,op1,op2,op3,op4
+; Four MIX steps on the word pairs (r0,r1),(r2,r3),(r4,r5),(r6,r7); _Rn_ is the round number, op1..op4 are optional ops interleaved after each MIX
+ addReg rX_512_&r0, rX_512_&r1 ;MIX 0: add, rotate the odd word, xor
+ RotL64 rX_512_&r1, 512,%((_Rn_) and 7),0
+ xor rX_512_&r1, rX_512_&r0
+ op1
+ addReg rX_512_&r2, rX_512_&r3 ;MIX 1
+ RotL64 rX_512_&r3, 512,%((_Rn_) and 7),1
+ xor rX_512_&r3, rX_512_&r2
+ op2
+ addReg rX_512_&r4, rX_512_&r5 ;MIX 2
+ RotL64 rX_512_&r5, 512,%((_Rn_) and 7),2
+ xor rX_512_&r5, rX_512_&r4
+ op3
+ addReg rX_512_&r6, rX_512_&r7 ;MIX 3
+ RotL64 rX_512_&r7, 512,%((_Rn_) and 7),3
+ xor rX_512_&r7, rX_512_&r6
+ op4
+ Skein_Debug_Round 512,%(_Rn_+1),-4
+; rotation constants repeat every 8 rounds, hence the (round number and 7) selector above
+endm ;R_512_OneRound
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds, followed by one key injection, for 512-bit blocks
+;
+R_512_FourRounds macro _RR_ ;RR = base round number (0 mod 4)
+ if SKEIN_ASM_UNROLL and 512
+ ; here for fully unrolled case.
+ _II_ = ((_RR_)/4) + 1 ;key injection counter
+ R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rax,[FP_+ksKey+8*(((_II_)+3) mod 9)]>,,<mov rbx,[FP_+ksKey+8*(((_II_)+4) mod 9)]>
+ R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*(((_II_)+5) mod 9)]>,,<mov rdx,[FP_+ksKey+8*(((_II_)+6) mod 9)]>
+ R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rsi,[FP_+ksKey+8*(((_II_)+7) mod 9)]>,,<add rcx,[FP_+ksTwk+8*(((_II_)+0) mod 3)]>
+ R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<add rdx,[FP_+ksTwk+8*(((_II_)+1) mod 3)]>,
+ ; inject the key schedule (rax..rsi were pre-loaded with key/tweak words during the rounds above)
+ add r08,[FP_+ksKey+8*(((_II_)+0) mod 9)]
+ addReg r11,rax
+ add r09,[FP_+ksKey+8*(((_II_)+1) mod 9)]
+ addReg r12,rbx
+ add r10,[FP_+ksKey+8*(((_II_)+2) mod 9)]
+ addReg r13,rcx
+ addReg r14,rdx
+ addReg r15,rsi,,,(_II_)
+ else
+ ; here for looping case ;"rotate" key/tweak schedule (move up on stack)
+ inc rdi ;bump key injection counter
+ R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rdx,[FP_+ksKey+8*rdi+8*6]>,<mov rax,[FP_+ksTwk+8*rdi-8*1]> ,<mov rsi,[FP_+ksKey+8*rdi-8*1]>
+ R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*rdi+8*5]>,<mov [FP_+ksTwk+8*rdi+8*2],rax>,<mov [FP_+ksKey+8*rdi+8*8],rsi>
+ R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rbx,[FP_+ksKey+8*rdi+8*4]>,<add rdx,[FP_+ksTwk+8*rdi+8*1]> ,<mov rsi,[FP_+ksKey+8*rdi+8*7]>
+ R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<mov rax,[FP_+ksKey+8*rdi+8*3]>,<add rcx,[FP_+ksTwk+8*rdi+8*0]>
+ ; inject the key schedule (indexed by the injection counter in rdi)
+ add r08,[FP_+ksKey+8*rdi+8*0]
+ addReg r11,rax
+ addReg r12,rbx
+ add r09,[FP_+ksKey+8*rdi+8*1]
+ addReg r13,rcx
+ addReg r14,rdx
+ add r10,[FP_+ksKey+8*rdi+8*2]
+ addReg r15,rsi
+ addReg r15,rdi ;inject the round number
+ endif
+ ;show the result of the key injection
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+endm ;R_512_FourRounds
+;
+;;;;;;;;;;;;;;;;;
+; instantiated code
+;
+Skein_512_Process_Block proc frame
+ Setup_Stack 512,ROUNDS_512/8
+ mov rbx,[rdi+TWEAK+ 8] ;rbx = tweak word T[1] on entry to the block loop
+ jmp short Skein_512_block_loop
+ align 16
+ ; main hash loop for Skein_512 (one iteration per input block)
+Skein_512_block_loop:
+ ; general register usage:
+ ; RAX..RDX = temps for key schedule pre-loads
+ ; R08..R15 = X0..X7
+ ; RSP, RBP = stack/frame pointers
+ ; RDI = key injection counter (looping builds) or context pointer
+ ; RSI = temp
+ ;
+ mov rax,[rdi+TWEAK+ 0]
+ add rax,[FP_+bitAdd] ;compute updated tweak value T0
+ mov rcx,rbx
+ xor rcx,rax ;rax/rbx/rcx = tweak schedule (T0, T1, T0 xor T1)
+ mov [rdi+TWEAK+ 0],rax ;save updated tweak value ctx->h.T[0]
+ mov [FP_+ksTwk+ 0],rax
+ mov rdx,KW_PARITY
+ mov rsi,[FP_+blkPtr ] ;rsi --> input block
+ mov [FP_+ksTwk+ 8],rbx
+ mov [FP_+ksTwk+16],rcx
+
+ irp _Rn_,<0,1,2,3,4,5,6,7>
+ mov rX_512_&_Rn_,[rdi+X_VARS+8*(_Rn_)]
+ xor rdx,rX_512_&_Rn_ ;compute overall parity
+ mov [FP_+ksKey+8*(_Rn_)],rX_512_&_Rn_
+ endm ;load state into r08..r15, compute parity
+ mov [FP_+ksKey+8*(8)],rdx ;save key schedule parity
+
+ addReg rX_512_5,rax ;precompute key injection for tweak
+ addReg rX_512_6,rbx
+if _SKEIN_DEBUG
+ mov [rdi+TWEAK+ 8],rbx ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+endif
+ mov rax,[rsi+ 0] ;load input block
+ mov rbx,[rsi+ 8]
+ mov rcx,[rsi+16]
+ mov rdx,[rsi+24]
+ addReg r08,rax ;do initial key injection
+ addReg r09,rbx
+ mov [FP_+Wcopy+ 0],rax ;keep local copy for feedforward
+ mov [FP_+Wcopy+ 8],rbx
+ addReg r10,rcx
+ addReg r11,rdx
+ mov [FP_+Wcopy+16],rcx
+ mov [FP_+Wcopy+24],rdx
+
+ mov rax,[rsi+32] ;second half of the input block
+ mov rbx,[rsi+40]
+ mov rcx,[rsi+48]
+ mov rdx,[rsi+56]
+ addReg r12,rax
+ addReg r13,rbx
+ addReg r14,rcx
+ addReg r15,rdx
+ mov [FP_+Wcopy+32],rax
+ mov [FP_+Wcopy+40],rbx
+ mov [FP_+Wcopy+48],rcx
+ mov [FP_+Wcopy+56],rdx
+
+if _SKEIN_DEBUG
+ irp _Rn_,<0,1,2,3,4,5,6,7> ;save values on stack for debug output
+ mov [rsp+X_stk+8*(_Rn_)],rX_512_&_Rn_
+ endm
+
+ Skein_Debug_Block 512 ;debug dump
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+endif
+ add rsi, 8*WCNT ;skip the block
+ mov [FP_+blkPtr ],rsi ;update block pointer
+ ;
+ ;;;;;;;;;;;;;;;;;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT = ROUNDS_512/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ if ((ROUNDS_512/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_512"
+ endif
+ xor rdi,rdi ;rdi = key injection counter (bumped once per four rounds)
+Skein_512_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+ cmp rdi,2*(ROUNDS_512/8) ;one injection per four rounds, so ROUNDS_512/4 injections in total
+ jb Skein_512_round_loop
+ mov rdi,[FP_+ctxPtr ] ;restore rdi --> context
+endif
+ ; end of rounds
+ ;;;;;;;;;;;;;;;;;
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ irp _Rn_,<0,1,2,3,4,5,6,7>
+ if (_Rn_ eq 0)
+ mov rbx,FIRST_MASK
+ endif
+ xor rX_512_&_Rn_,[FP_+Wcopy+8*(_Rn_)] ;feedforward XOR
+ mov [rdi+X_VARS+8*(_Rn_)],rX_512_&_Rn_ ;and store result
+ if (_Rn_ eq 6)
+ and rbx,[rdi+TWEAK+ 8] ;rbx = ctx T[1] masked with FIRST_MASK, carried into the next block
+ endif
+ endm
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+ ; go back for more blocks, if needed
+ dec qword ptr [FP_+blkCnt]
+ jnz Skein_512_block_loop
+ mov [rdi+TWEAK + 8],rbx ;all blocks done: write the masked T[1] back to the context
+
+ Reset_Stack Skein_512_Process_Block
+ ret
+;
+ if _SKEIN_DEBUG
+; call here with r08 = "round number"
+Skein_Debug_Round_512:
+ push rdx ;save two regs for BLK_BITS-specific parms
+ push rcx
+ mov rcx,[rsp+24] ;get back original r08 (pushed on stack in macro call)
+ mov [FP_+X_stk],rcx ;and save it in X_stk
+ irp _Rn_,<1,2,3,4,5,6,7> ;save rest of X[] state on stack so debug routines can access it
+ mov [FP_+X_stk+8*(_Rn_)],rX_512_&_Rn_
+ endm
+ mov rdx,[FP_+ctxPtr] ;ctx_hdr_ptr
+ mov rcx, 512 ;block size
+ jmp Skein_Debug_Round_Common
+ endif
+;
+Skein_512_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE
+ public Skein_512_Process_Block_CodeSize
+Skein_512_Process_Block_CodeSize proc ;returns the assemble-time value _ProcBytes_ in rax
+ mov rax,_ProcBytes_ ;_ProcBytes_ is set outside this excerpt (presumably the byte size of the block function -- confirm)
+ ret
+Skein_512_Process_Block_CodeSize endp
+;
+ public Skein_512_Unroll_Cnt
+Skein_512_Unroll_Cnt proc ;returns the unroll count in rax; 0 means fully unrolled
+ if _UNROLL_CNT ne ROUNDS_512/8 ;looping build: report the configured unroll count
+ mov rax,_UNROLL_CNT
+ else
+ xor rax,rax ;fully unrolled build: report 0
+ endif
+ ret
+Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;=================================== Skein1024 =============================================
+if _USE_ASM_ and 1024
+ public Skein1024_Process_Block
+;
+; void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; use details of permutation to make register assignments
+;
+r1K_x0 equ rdi ;X[0]; the same register doubles as rIdx in looping builds
+r1K_x1 equ rsi ;X[1]
+r1K_x2 equ rbp ;X[2]
+r1K_x3 equ rax ;X[3]
+r1K_x4 equ rcx ;"shared" with X6, since X4/X6 alternate
+r1K_x5 equ rbx ;X[5]
+r1K_x6 equ rcx ;same register as X4; the inactive one of the two lives in X_stk
+r1K_x7 equ rdx ;X[7]
+r1K_x8 equ r08
+r1K_x9 equ r09
+r1K_xA equ r10
+r1K_xB equ r11
+r1K_xC equ r12
+r1K_xD equ r13
+r1K_xE equ r14
+r1K_xF equ r15
+;
+rIdx equ r1K_x0 ;index register for looping versions
+rIdx_offs equ tmpStk_1024 ;stack slot that holds the injection counter between uses of rIdx
+;
+R1024_Mix macro w0,w1,_RN0_,_Rn1_,op1 ;one MIX on words w0,w1 (hex digits) in round _RN0_; _Rn1_ selects the rotation constant; op1 = optional trailing op
+_w0 = 0&w0&h ;handle the hex conversion
+_w1 = 0&w1&h
+_II_ = ((_RN0_)/4)+1 ;injection count
+ ;
+ addReg r1K_x&w0 , r1K_x&w1 ;perform the MIX
+ RotL64 r1K_x&w1 , 1024,%((_RN0_) and 7),_Rn1_
+ xor r1K_x&w1 , r1K_x&w0
+ if ((_RN0_) and 3) eq 3 ;last round of a group of four: key injection is folded in here
+ if _SKEIN_DEBUG
+ mov [rsp+xDebug_1024+8*_w0],r1K_x&w0 ;save intermediate values for Debug_Round
+ mov [rsp+xDebug_1024+8*_w1],r1K_x&w1 ; (before inline key injection)
+ endif
+ if SKEIN_ASM_UNROLL and 1024 ;here to do fully unrolled key injection
+ add r1K_x&w0, [rsp+ksKey+ 8*((_II_+_w0) mod 17)]
+ add r1K_x&w1, [rsp+ksKey+ 8*((_II_+_w1) mod 17)]
+ if _w1 eq 13 ;tweak injection
+ add r1K_x&w1, [rsp+ksTwk+ 8*((_II_+0 ) mod 3)]
+ elseif _w0 eq 14
+ add r1K_x&w0, [rsp+ksTwk+ 8*((_II_+1 ) mod 3)]
+ elseif _w1 eq 15
+ add r1K_x&w1, _II_ ;(injection counter)
+ endif
+ else ;here to do looping key injection
+ if (_w0 eq 0)
+ mov [rsp+X_stk+8*_w0],r1K_x0 ;if so, store N0 so we can use reg as index
+ mov rIdx, [rsp+rIdx_offs] ;get the injection counter index into rIdx (N0)
+ else
+ add r1K_x&w0, [rsp+ksKey+8+8*rIdx+8*_w0] ;even key injection
+ endif
+ if _w1 eq 13 ;tweak injection
+ add r1K_x&w1, [rsp+ksTwk+8+8*rIdx+8*0 ]
+ elseif _w0 eq 14
+ add r1K_x&w0, [rsp+ksTwk+8+8*rIdx+8*1 ]
+ elseif _w1 eq 15
+ addReg r1K_x&w1, rIdx,,,1 ;(injection counter)
+ endif
+ add r1K_x&w1, [rsp+ksKey+8+8*rIdx+8*_w1] ;odd key injection
+ endif
+ endif
+ ; insert the op provided, if any
+ op1
+endm
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 1024-bit blocks
+;
+R1024_OneRound macro x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,_Rn_
+ if (x0 ne 0) or ((x4 ne 4) and (x4 ne 6)) or (x4 ne (x6 xor 2)) ;x0 must be word 0, and x4/x6 must be words 4 and 6 in either order (they share one register)
+ .err "faulty register assignment!"
+ endif
+ R1024_Mix x0,x1,_Rn_,0
+ R1024_Mix x2,x3,_Rn_,1
+ R1024_Mix x4,x5,_Rn_,2, <mov [rsp+X_stk+8*0&x4&h],r1K_x4> ;save x4 on stack (x4/x6 alternate)
+ R1024_Mix x8,x9,_Rn_,4, <mov r1K_x6,[rsp+X_stk+8*0&x6&h]> ;load x6 from stack
+ R1024_Mix xA,xB,_Rn_,5
+ R1024_Mix xC,xD,_Rn_,6
+ R1024_Mix x6,x7,_Rn_,3
+ R1024_Mix xE,xF,_Rn_,7
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ endif
+endm ;R1024_OneRound
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds for 1024-bit blocks
+;
+R1024_FourRounds macro _RR_ ;RR = base round number (0 mod 4)
+ ; should be here with r1K_x4 set properly, x6 stored on stack
+ R1024_OneRound 0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,%((_RR_)+0)
+ R1024_OneRound 0,9,2,D,6,B,4,F,A,7,C,3,E,5,8,1,%((_RR_)+1)
+ R1024_Oneround 0,7,2,5,4,3,6,1,C,F,E,D,8,B,A,9,%((_RR_)+2)
+ R1024_Oneround 0,F,2,B,6,D,4,9,E,1,8,5,A,3,C,7,%((_RR_)+3)
+ if (SKEIN_ASM_UNROLL and 1024) eq 0 ;here with r1K_x0 == rIdx, X0 on stack
+ ;rotate the key schedule on the stack (copy entry rIdx to rIdx+17 for keys, rIdx+3 for tweaks)
+ mov [rsp+X_stk+ 8* 8],r1K_x8;free up a reg
+ mov r1K_x8,[rsp+ksKey+8*rIdx+8* 0] ;get key
+ mov [rsp+ksKey+8*rIdx+8*17],r1K_x8 ;rotate it (must do key first or tweak clobbers it!)
+ mov r1K_x8,[rsp+ksTwk+8*rIdx+8* 0] ;get tweak
+ mov [rsp+ksTwk+8*rIdx+8* 3],r1K_x8 ;rotate it
+ mov r1K_x8,[rsp+X_stk+ 8* 8] ;get the reg back
+ inc rIdx ;bump the index
+ mov [rsp+rIdx_offs],rIdx ;save it
+ mov r1K_x0,[rsp+ksKey+8*rIdx] ;get the key schedule word for X0
+ add r1K_x0,[rsp+X_stk+8*0] ;perform the X0 key injection
+ endif
+ ;show the result of the key injection
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
+endm ;R1024_FourRounds
+;
+;;;;;;;;;;;;;;;;
+; code
+;
+Skein1024_Process_Block proc frame
+;
+ Setup_Stack 1024,ROUNDS_1024/8,NO_FRAME,<WCNT>
+ mov r09,[rdi+TWEAK+ 8] ;r09 = tweak word T[1] on entry to the block loop
+ jmp short Skein1024_block_loop
+ align 16
+ ; main hash loop for Skein1024 (one iteration per input block)
+Skein1024_block_loop:
+ ; general register usage:
+ ; RSP = stack pointer
+ ; RDI,RSI,RBP,RAX = X0..X3; RCX = X4/X6 (shared); RBX = X5; RDX = X7
+ ; R08..R15 = X8..X15 (state words)
+ ; (the r1K_x* equates give the exact assignment; RDI doubles as rIdx in looping builds)
+ ;
+ if (SKEIN_ASM_UNROLL and 1024) eq 0
+ xor rax,rax ;init loop index on the stack
+ mov [rsp+rIdx_offs],rax
+ endif
+ mov r08,[rdi+TWEAK+ 0]
+ add r08,[FP_+bitAdd] ;compute updated tweak value T0
+ mov r10,r09
+ xor r10,r08 ;r08/r09/r10 = tweak schedule (T0, T1, T0 xor T1)
+ mov [rdi+TWEAK+ 0],r08 ;save updated tweak value ctx->h.T[0]
+ mov [FP_+ksTwk+ 0],r08
+ mov [FP_+ksTwk+ 8],r09 ;keep values in r08,r09 for initial tweak injection below
+ mov [FP_+ksTwk+16],r10
+ if _SKEIN_DEBUG
+ mov [rdi+TWEAK+ 8],r09 ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block
+ endif
+ mov rsi ,[FP_+blkPtr ] ;rsi (r1K_x1) --> input block
+ mov rax , KW_PARITY ;overall key schedule parity
+
+ ; logic here assumes the set {rdi,rsi,rbp,rax} = r1K_x{0,1,2,3}
+
+ irp _rN_,<0,1,2,3,4,6> ;process the "initial" words, using r14,r15 as temps
+ mov r14,[rdi+X_VARS+8*_rN_] ;get state word
+ mov r15,[rsi+ 8*_rN_] ;get msg word
+ xor rax,r14 ;update key schedule parity
+ mov [FP_+ksKey +8*_rN_],r14 ;save key schedule word on stack
+ mov [FP_+Wcopy +8*_rN_],r15 ;save local msg Wcopy
+ add r14,r15 ;do the initial key injection
+ mov [rsp+X_stk +8*_rN_],r14 ;save initial state var on stack
+ endm
+ ; now process the rest, using the "real" registers
+ ; (MUST do it in reverse order to inject tweaks r08/r09 first)
+ irp _rN_,<F,E,D,C,B,A,9,8,7,5>
+_rr_ = 0&_rN_&h
+ mov r1K_x&_rN_,[rdi+X_VARS+8*_rr_] ;get key schedule word from context
+ mov r1K_x4 ,[rsi+ 8*_rr_] ;get next input msg word
+ mov [rsp+ksKey +8*_rr_],r1K_x&_rN_ ;save key schedule on stack
+ xor rax , r1K_x&_rN_ ;accumulate key schedule parity
+ mov [FP_+Wcopy +8*_rr_],r1K_x4 ;save copy of msg word for feedforward
+ add r1K_x&_rN_, r1K_x4 ;do the initial key injection
+ if _rr_ eq 13 ;do the initial tweak injection
+ addReg r1K_x&_rN_,r08 ; (only in words 13/14)
+ elseif _rr_ eq 14
+ addReg r1K_x&_rN_,r09
+ endif
+ endm
+ mov [FP_+ksKey+8*WCNT],rax ;save key schedule parity
+if _SKEIN_DEBUG
+ Skein_Debug_Block 1024 ;debug dump
+endif
+ addReg rsi,8*WCNT ;bump the msg ptr
+ mov [FP_+blkPtr],rsi ;save bumped msg ptr
+ ; re-load words 0..4 [rdi,rsi,rbp,rax,rcx] from stack, enter the main loop
+ irp _rN_,<0,1,2,3,4> ;(no need to re-load x6)
+ mov r1K_x&_rN_,[rsp+X_stk+8*_rN_] ;re-load state and get ready to go!
+ endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL ;show state after initial key injection
+endif
+ ;
+ ;;;;;;;;;;;;;;;;;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT = ROUNDS_1024/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL1024"
+ endif
+Skein1024_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ;implement the rounds, 4 at a time
+ R1024_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+ cmp qword ptr [rsp+tmpStk_1024],2*(ROUNDS_1024/8) ;see if we are done
+ jb Skein1024_round_loop
+endif
+ ; end of rounds
+ ;;;;;;;;;;;;;;;;;
+ ;
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ mov [rsp+X_stk+8*7],r1K_x7 ;we need a register. x6 already on stack
+ mov r1K_x7,[rsp+ctxPtr] ;r1K_x7 (rdx) now points at the context
+
+ irp _rN_,<0,1,2,3,4,5,8,9,A,B,C,D,E,F> ;do all but x6,x7
+ xor r1K_x&_rN_,[rsp +Wcopy +8*(0&_rN_&h)] ;feedforward XOR
+ mov [r1K_x7+X_VARS+8*(0&_rN_&h)],r1K_x&_rN_ ;save result into context
+ if (0&_rN_&h eq 9)
+ mov r09,FIRST_MASK ;x9 has been stored, so r09 is free to hold the mask
+ endif
+ if (0&_rN_&h eq 0eh)
+ and r09,[r1K_x7+TWEAK+ 8] ;r09 = ctx T[1] masked with FIRST_MASK, carried into the next block
+ endif
+ endm
+ ;
+ mov rax,[rsp+X_stk +8*6] ;now process x6,x7
+ mov rbx,[rsp+X_stk +8*7]
+ xor rax,[rsp+Wcopy +8*6]
+ xor rbx,[rsp+Wcopy +8*7]
+ mov [r1K_x7+X_VARS+8*6],rax
+ dec qword ptr [rsp+blkCnt] ;set zero flag iff done
+ mov [r1K_x7+X_VARS+8*7],rbx
+
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmp qword ptr [rsp+blkCnt],0>
+ ; go back for more blocks, if needed
+ mov rdi,[rsp+ctxPtr] ;don't muck with the flags here!
+ lea rbp,[rsp+FRAME_OFFS] ;rbp was used for X2, so rebuild the frame pointer (lea leaves flags alone)
+ jnz Skein1024_block_loop
+ mov [r1K_x7+TWEAK+ 8],r09 ;all blocks done: write the masked T[1] back to the context
+ Reset_Stack Skein1024_Process_Block
+ ret
+;
+if _SKEIN_DEBUG
+; call here with r08 = "round number"
+Skein_Debug_Round_1024:
+_SP_OFFS_ = 8*2 ;stack "offset" here: r08, return addr
+ SP_ equ <rsp + _SP_OFFS_> ;useful shorthand below
+;
+ irp _wN_,<1,2,3,5,7,9,A,B,C,D,E,F> ;save rest of X[] state on stack so debug routines can access it
+ mov [SP_+X_stk+8*(0&_wN_&h)],r1K_x&_wN_
+ endm
+ ;figure out what to do with x0. On rounds R where R==0 mod 4, it's already on the stack
+ cmp r08,SKEIN_RND_SPECIAL ;special rounds always save
+ jae save_x0
+ test r08,3
+ jz save_x0_not
+save_x0:
+ mov [SP_+X_stk+8*0],r1K_x0
+save_x0_not:
+ ;figure out the x4/x6 swapping state and save the correct one!
+ cmp r08,SKEIN_RND_SPECIAL ;special rounds always do x4
+ jae save_x4
+ test r08,1 ;and even ones have r4 as well
+ jz save_x4
+ mov [SP_+X_stk+8*6],r1K_x6
+ jmp short debug_1024_go
+save_x4:
+ mov [SP_+X_stk+8*4],r1K_x4
+debug_1024_go:
+ ;now all is saved in Xstk[] except for X8
+ push rdx ;save two regs for BLK_BITS-specific parms
+ push rcx
+_SP_OFFS_ = _SP_OFFS_ + 16 ;adjust stack offset accordingly
+ ; now stack offset is 32 to X_stk
+ mov rcx,[SP_ - 8] ;get back original r08 (pushed on stack in macro call)
+ mov [SP_+X_stk+8*8],rcx ;and save it in its rightful place in X_stk[8]
+ mov rdx,[SP_+ctxPtr] ;ctx_hdr_ptr
+ mov rcx, 1024 ;block size
+ jmp Skein_Debug_Round_Common
+endif
+;
+Skein1024_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE
+ public Skein1024_Process_Block_CodeSize
+Skein1024_Process_Block_CodeSize proc ;returns the assemble-time value _ProcBytes_ in rax
+ mov rax,_ProcBytes_ ;_ProcBytes_ is set outside this excerpt (presumably the byte size of the block function -- confirm)
+ ret
+Skein1024_Process_Block_CodeSize endp
+;
+ public Skein1024_Unroll_Cnt
+Skein1024_Unroll_Cnt proc ;returns the unroll count in rax; 0 means fully unrolled
+ if _UNROLL_CNT ne ROUNDS_1024/8 ;looping build: report the configured unroll count
+ mov rax,_UNROLL_CNT
+ else
+ xor rax,rax ;fully unrolled build: report 0
+ endif
+ ret
+Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;
+if _SKEIN_DEBUG
+;----------------------------------------------------------------
+;local debug routine to set up for calls to:
+; void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+; here with r08 = round number
+; rdx = ctx_hdr_ptr
+; rcx = block size (256/512/1024)
+;
+Skein_Debug_Round_Common:
+_SP_OFFS_ = 32 ;current stack "offset": r08, retAddr, rcx, rdx
+ irp _rr_,<rax,rbx,rsi,rdi,rbp,r09,r10,r11,r12,r13,r14,r15> ;save the rest of the regs
+ push _rr_
+_SP_OFFS_ = _SP_OFFS_+8
+ endm
+ if (_SP_OFFS_ and 0Fh) ; make sure stack is still 16-byte aligned here
+ .err "Debug_Round_Common: stack alignment"
+ endif
+ ; compute r09 = ptr to the X[] array on the stack
+ lea r09,[SP_+X_stk] ;adjust for reg pushes, return address
+ cmp r08,SKEIN_RND_FEED_FWD ;special handling for feedforward "round"?
+ jnz _got_r09a
+ lea r09,[rdx+X_VARS] ;feedforward: show the chaining vars stored in the context instead
+_got_r09a:
+ if _USE_ASM_ and 1024
+ ; special handling for 1024-bit case
+ ; (for rounds right before with key injection:
+ ; use xDebug_1024[] instead of X_stk[])
+ cmp r08,SKEIN_RND_SPECIAL
+ jae _got_r09b ;special "rounds" keep the pointer chosen above
+ or r08,r08
+ jz _got_r09b ;round 0 keeps X_stk
+ test r08,3
+ jne _got_r09b ;not a multiple of 4, so not just before key injection
+ cmp rcx,1024 ;only 1024-bit(s) for now
+ jne _got_r09b
+ lea r09,[SP_+xDebug_1024]
+_got_r09b:
+ endif
+ sub rsp, 8*4 ;make room for parms on stack
+ call Skein_Show_Round ;call external debug handler
+ add rsp, 8*4 ;discard parm space on the stack
+
+ irp _rr_,<r15,r14,r13,r12,r11,r10,r09,rbp,rdi,rsi,rbx,rax> ;restore regs (reverse of the push order above)
+ pop _rr_
+_SP_OFFS_ = _SP_OFFS_-8
+ endm
+ if _SP_OFFS_ - 32
+ .err "Debug_Round_Common: push/pop misalignment!"
+ endif
+ pop rcx ;rcx and rdx were pushed by the size-specific entry stub
+ pop rdx
+ ret
+endif
+;----------------------------------------------------------------
+ end
diff --git a/Additional_Implementations/skein_block_x64.s b/Additional_Implementations/skein_block_x64.s
new file mode 100644
index 000000000000..b2d0a83acbe9
--- /dev/null
+++ b/Additional_Implementations/skein_block_x64.s
@@ -0,0 +1,1328 @@
+#
+#----------------------------------------------------------------
+# 64-bit x86 assembler code (gnu as) for Skein block functions
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+#
+ .text
+ .altmacro
+ .psize 0,128 #list file has no page boundaries
+#
+_MASK_ALL_ = (256+512+1024) #all three algorithm bits
+_MAX_FRAME_ = 240
+#
+################# _USE_ASM_: bit mask of the block sizes selected
+.ifndef SKEIN_USE_ASM #default: all three algorithm bits set
+_USE_ASM_ = _MASK_ALL_
+.else
+_USE_ASM_ = SKEIN_USE_ASM
+.endif
+#################
+.ifndef SKEIN_LOOP #configure loop unrolling
+_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024
+.else
+_SKEIN_LOOP = SKEIN_LOOP
+ .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line
+.print "+++ SKEIN_LOOP = \_NN_"
+ .endr
+.endif
+# the unroll counts (0 --> fully unrolled), one decimal digit of _SKEIN_LOOP per block size
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
+# SKEIN_ASM_UNROLL: sum of the block sizes whose unroll digit is 0 (fully unrolled)
+SKEIN_ASM_UNROLL = 0
+ .irp _NN_,256,512,1024
+ .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_
+ .endif
+ .endr
+#################
+#
+.ifndef SKEIN_ROUNDS #default round counts
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+.else #one decimal digit d per block size: rounds = 8*(((d+5) % 10)+5), so d=5..9 gives 40..72 and d=0..4 gives 80..112
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
+# only display rounds if default size is changed on command line
+.irp _NN_,256,512,1024
+ .if _USE_ASM_ && \_NN_
+ .irp _RR_,%(ROUNDS_\_NN_)
+ .if _NN_ < 1024
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .else
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .endif
+ .endr
+ .endif
+.endr
+.endif
+#################
+#
+.ifdef SKEIN_CODE_SIZE #_SKEIN_CODE_SIZE = 1 if either SKEIN_CODE_SIZE or SKEIN_PERF is defined, else 0
+_SKEIN_CODE_SIZE = (1)
+.else
+.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.else
+_SKEIN_CODE_SIZE = (0)
+.endif
+.endif
+#
+#################
+#
+.ifndef SKEIN_DEBUG #_SKEIN_DEBUG = 1 only when SKEIN_DEBUG is defined
+_SKEIN_DEBUG = 0
+.else
+_SKEIN_DEBUG = 1
+.endif
+#################
+#
+# define offsets of fields in hash context structure
+#
+HASH_BITS = 0 #bits of hash output
+BCNT = 8 + HASH_BITS #number of bytes in BUFFER[]
+TWEAK = 8 + BCNT #tweak values[0..1]
+X_VARS = 16 + TWEAK #chaining vars (they follow the two 8-byte tweak words)
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words
+FIRST_MASK = ~ (1 << 6) #all ones except bit 6
+FIRST_MASK64= ~ (1 << 62) #all ones except bit 62
+#
+# rotation constants for Skein, named RC_(block bits)_(round index 0..7)_(mix index within the round)
+#
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+# Input: reg
+# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+#
+.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM #rotate a 64-bit register left by the matching RC_ constant
+_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM #look up the rotation count by name
+ .if _RCNT_ #is there anything to do?
+ rolq $_RCNT_,%\reg
+ .endif
+.endm
+#
+#----------------------------------------------------------------
+#
+# MACROS: define local vars and configure stack
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack
+.macro StackVar localName,localSize #define a symbol at the current stack offset, then advance the offset
+\localName = _STK_OFFS_
+_STK_OFFS_ = _STK_OFFS_+(\localSize)
+.endm #StackVar
+#
+#----------------------------------------------------------------
+#
+# MACRO: Configure stack frame, allocate local vars
+#
+.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
+ WCNT = (\BLK_BITS)/64
+#
+_PushCnt_ = 0 #save nonvolatile regs on stack
+ .irp _reg_,rbp,rbx,r12,r13,r14,r15
+ pushq %\_reg_
+_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment
+ .endr
+#
+_STK_OFFS_ = 0 #starting offset from rsp
+ #---- local variables #<-- rsp
+ StackVar X_stk ,8*(WCNT) #local context vars
+ StackVar ksTwk ,8*3 #key schedule: tweak words
+ StackVar ksKey ,8*(WCNT)+8 #key schedule: key words
+ .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
+ StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
+ .endif
+ StackVar Wcopy ,8*(WCNT) #copy of input block
+ .if _SKEIN_DEBUG
+ .if \debugCnt + 0 #temp location for debug X[] info
+ StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
+ .endif
+ .endif
+ .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 #NOTE(review): every size above is a multiple of 8, so this looks always true -- confirm whether mod 16 was intended
+ StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?)
+tmpStk_\BLK_BITS = align16 #use this
+ .endif
+ #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
+ StackVar ctxPtr ,8 #context ptr
+ StackVar blkPtr ,8 #pointer to block data
+ StackVar blkCnt ,8 #number of full blocks to process
+ StackVar bitAdd ,8 #bit count to add to tweak
+LOCAL_SIZE = _STK_OFFS_ #size of "local" vars
+ #----
+ StackVar savRegs,8*_PushCnt_ #saved registers
+ StackVar retAddr,8 #return address
+ #---- caller's stack frame (aligned mod 16)
+#
+# set up the stack frame pointer (rbp)
+#
+FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, ksKey
+ .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range
+FRAME_OFFS = _STK_OFFS_
+ .endif
+F_O = -FRAME_OFFS
+#
+ #put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
+#
+# Notes on stack frame setup:
+# * the most frequently used variable is X_stk[], based at [rsp+0]
+# * the next most used is the key schedule arrays, ksKey and ksTwk
+# so rbp is "centered" there, allowing short offsets to the key
+# schedule even in 1024-bit Skein case
+# * the Wcopy variables are infrequently accessed, but they have long
+# offsets from both rsp and rbp only in the 1024-bit case.
+# * all other local vars and calling parameters can be accessed
+# with short offsets, except in the 1024-bit case
+#
+ subq $LOCAL_SIZE,%rsp #make room for the locals
+ leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets
+ movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack
+ movq %rsi, blkPtr+F_O(%rbp)
+ movq %rdx, blkCnt+F_O(%rbp)
+ movq %rcx, bitAdd+F_O(%rbp)
+#
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+.macro Reset_Stack #undo Setup_Stack: drop the locals and restore the saved registers
+ addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??)
+ .irp _reg_,r15,r14,r13,r12,rbx,rbp
+ popq %\_reg_ #restore caller's regs (reverse of the push order in Setup_Stack)
+_PushCnt_ = _PushCnt_ - 1
+ .endr
+ .if _PushCnt_ #must be back to zero if pushes and pops match
+ .error "Mismatched push/pops?"
+ .endif
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+ .extern Skein_Show_Block #calls to C routines
+ .extern Skein_Show_Round
+#
+SKEIN_RND_SPECIAL = 1000 #round "numbers" at or above this value mark special debug events
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 #state after the initial key injection
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 #state after a key injection between rounds
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 #state after the feedforward XOR
+#
+.macro Skein_Debug_Block BLK_BITS
+#
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+# const u08b_t *blkPtr, const u64b_t *wPtr,
+# const u64b_t *ksPtr,const u64b_t *tsPtr)
+#
+_NN_ = 0
+ .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
+ pushq %\_reg_ #save all volatile regs on stack before the call
+_NN_ = _NN_ + 1
+ .endr
+ # get and push call parameters (six in registers, the seventh on the stack)
+ movq $\BLK_BITS ,%rdi #bits
+ movq ctxPtr+F_O(%rbp),%rsi #h (pointer)
+ leaq X_VARS (%rsi),%rdx #X (pointer)
+ movq blkPtr+F_O(%rbp),%rcx #blkPtr
+ leaq Wcopy +F_O(%rbp),%r8 #wPtr
+ leaq ksKey +F_O(%rbp),%r9 #key pointer
+ leaq ksTwk +F_O(%rbp),%rax #tweak pointer
+ pushq %rax # (pass on the stack)
+ call Skein_Show_Block #call external debug handler
+ addq $8*1,%rsp #discard parameters on stack
+ .if (_NN_ % 2 ) == 0 #check stack alignment (odd register count plus the one parameter push)
+ .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
+ .endif
+ .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
+ popq %\_reg_ #restore regs
+_NN_ = _NN_ - 1
+ .endr
+ .if _NN_ #must be back to zero if pushes and pops match
+ .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
+ .endif
+.endm # Skein_Debug_Block
+#
+# the macro to "call" to debug a round
+#
+# Skein_Debug_Round: call the local Skein_Debug_Round_<BLK_BITS> routine with
+# the round "number" in rdx (the caller's rdx is saved and restored).
+#   BLK_BITS = block size (256/512/1024), selects the routine to call
+#   R        = round number, or one of the SKEIN_RND_* special codes
+#   RDI_OFFS = optional constant added to the computed round number
+#   afterOp  = optional instruction emitted after the call
+# When the code for this block size is fully unrolled (its bit is set in
+# SKEIN_ASM_UNROLL) or R is a special code, R is used as an immediate.
+# Otherwise the number is computed at run time as
+#   4*index + 1 + ((R-1) & 3) + RDI_OFFS
+# where index is rdi (or the rIdx stack slot for 1024 bits).
+# Note: the mask tests use bitwise "&"; a logical "&&" would yield 0/1.
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+ # call the appropriate (local) debug "function"
+ pushq %rdx #save rdx, so we can use it for round "number"
+ .if (SKEIN_ASM_UNROLL & \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+ movq $\R,%rdx
+ .else #compute round number using rdi
+_rOffs_ = \RDI_OFFS + 0
+ .if \BLK_BITS == 1024
+ movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above)
+ leaq 1+(((\R)-1) & 3)+_rOffs_(,%rdx,4),%rdx
+ .else
+ leaq 1+(((\R)-1) & 3)+_rOffs_(,%rdi,4),%rdx
+ .endif
+ .endif
+ call Skein_Debug_Round_\BLK_BITS
+ popq %rdx #restore original rdx value
+#
+ afterOp
+.endm # Skein_Debug_Round
+.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+# Non-debug build: both debug macros expand to nothing, so invocations in the
+# hash code cost no instructions.  Note that the afterOp argument of
+# Skein_Debug_Round is therefore NOT emitted in this configuration.
+.macro Skein_Debug_Block BLK_BITS
+.endm
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+.endm
+#
+.endif # _SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+#
+# addReg: dstReg += source register (+ immOffs).
+#   dstReg            = destination register name, without the "%"
+#   srcReg_A,srcReg_B = pasted together to form the source register name
+#                       (lets callers build names such as "r" + "8")
+#   useAddOp          = if non-zero, always emit addq rather than leaq
+#   immOffs           = optional constant added as well (forces leaq)
+# Unless ASM_NO_LEA is defined, the plain register add is emitted as leaq.
+.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+ .if \immOffs + 0 #non-zero immediate: fold it into the leaq displacement
+ leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+ .elseif ((\useAddOp + 0) == 0)
+ .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs!
+ leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+ .else
+ addq %\srcReg_A\srcReg_B,%\dstReg
+ .endif
+ .else #caller explicitly asked for addq
+ addq %\srcReg_A\srcReg_B,%\dstReg
+ .endif
+.endm
+
+# xorReg: dstReg ^= source register.
+# Operands are given destination-first (Intel-style ordering) to match addReg;
+# srcReg_A and srcReg_B are pasted together to form the source register name.
+.macro xorReg dstReg,srcReg_A,srcReg_B
+ xorq %\srcReg_A\srcReg_B,%\dstReg
+.endm
+#
+#----------------------------------------------------------------
+#
+# C_label: define a global entry point under two names, lName and _lName,
+# at the current location, so the symbol resolves whether or not the C
+# toolchain prepends an underscore to external names.
+.macro C_label lName
+ \lName: #use both "genders" to work across linkage conventions
+_\lName:
+ .global \lName
+ .global _\lName
+.endm
+#
+#=================================== Skein_256 =============================================
+#
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+#
+# code
+#
+# Entry point and per-block setup for Skein_256: load tweak and chaining
+# values, compute the key schedule parity word, load the input block,
+# keep a copy for the feedforward, and do the initial key/tweak injection.
+C_label Skein_256_Process_Block
+ Setup_Stack 256,((ROUNDS_256/8)+1)
+ movq TWEAK+8(%rdi),%r14 #r14 = T1 (kept in a register across blocks)
+ jmp Skein_256_block_loop #skip over the alignment padding
+ .p2align 4
+ # main hash loop for Skein_256
+Skein_256_block_loop:
+ #
+ # general register usage:
+ # RAX..RDX = X0..X3
+ # R08..R12 = ks[0..4]
+ # R13..R15 = ts[0..2]
+ # RSP, RBP = stack/frame pointers
+ # RDI = round counter or context pointer
+ # RSI = temp
+ #
+ movq TWEAK+0(%rdi) ,%r13
+ addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0
+ movq %r14 ,%r15
+ xorq %r13 ,%r15 #now %r13..%r15 hold the tweak schedule (T0, T1, T0^T1)
+
+ movq $KW_PARITY ,%r12
+ movq X_VARS+ 0(%rdi),%r8
+ movq X_VARS+ 8(%rdi),%r9
+ movq X_VARS+16(%rdi),%r10
+ movq X_VARS+24(%rdi),%r11
+ movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0]
+ xorq %r8 ,%r12 #start accumulating overall parity
+
+ movq blkPtr +F_O(%rbp) ,%rsi #rsi --> input block
+ xorq %r9 ,%r12
+ movq 0(%rsi) ,%rax #get X[0..3]
+ xorq %r10 ,%r12
+ movq 8(%rsi) ,%rbx
+ xorq %r11 ,%r12 #r12 = KW_PARITY ^ ks[0] ^ ks[1] ^ ks[2] ^ ks[3]
+ movq 16(%rsi) ,%rcx
+ movq 24(%rsi) ,%rdx
+
+ movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block
+ movq %rbx,Wcopy+ 8+F_O(%rbp)
+ movq %rcx,Wcopy+16+F_O(%rbp)
+ movq %rdx,Wcopy+24+F_O(%rbp)
+
+ addq %r8 ,%rax #initial key injection
+ addq %r9 ,%rbx
+ addq %r10,%rcx
+ addq %r11,%rdx
+ addq %r13,%rbx #tweak T0 goes into X1
+ addq %r14,%rcx #tweak T1 goes into X2
+
+.if _SKEIN_DEBUG
+ movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?)
+ movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block
+ movq %r9 ,ksKey+ 8+F_O(%rbp)
+ movq %r10,ksKey+16+F_O(%rbp)
+ movq %r11,ksKey+24+F_O(%rbp)
+ movq %r12,ksKey+32+F_O(%rbp)
+
+ movq %r13,ksTwk+ 0+F_O(%rbp)
+ movq %r14,ksTwk+ 8+F_O(%rbp)
+ movq %r15,ksTwk+16+F_O(%rbp)
+
+ movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block
+ movq %rbx,X_stk + 8(%rsp)
+ movq %rcx,X_stk +16(%rsp)
+ movq %rdx,X_stk +24(%rsp)
+
+ Skein_Debug_Block 256 #debug dump
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+.endif
+#
+# Looping (not fully unrolled) build only: spill the key schedule and tweak
+# schedule to the stack so the round loop can index them with %rdi.
+# Key word 0 (r8) is written to slot 5 and tweak word 0 (r13) to slot 3,
+# not to slot 0: the round loop below only reads ksKey slots 1.. and ksTwk
+# slots 1.. (indexed by %rdi), and itself stores each consumed word further
+# up (ksKey+8*6, ksTwk+8*4) to "rotate" the schedules.
+.if ((SKEIN_ASM_UNROLL & 256) == 0)
+ movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code
+ movq %r9 ,ksKey+ 8+F_O(%rbp)
+ movq %r10,ksKey+16+F_O(%rbp)
+ movq %r11,ksKey+24+F_O(%rbp)
+ movq %r12,ksKey+32+F_O(%rbp)
+
+ movq %r13,ksTwk+24+F_O(%rbp)
+ movq %r14,ksTwk+ 8+F_O(%rbp)
+ movq %r15,ksTwk+16+F_O(%rbp)
+.endif
+ addq $WCNT*8,%rsi #skip the block
+ movq %rsi,blkPtr +F_O(%rbp) #update block pointer
+ #
+ # now the key schedule is computed. Start the rounds
+ #
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT = ROUNDS_256/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_256
+ .if ((ROUNDS_256/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_256"
+ .endif
+ xorq %rdi,%rdi #rdi = iteration count
+Skein_256_round_loop:
+.endif
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled)
+ # round 4*_RBase_ + 0
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0
+ addReg rcx, rdx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
+ .endif
+ xorReg rbx, rax
+ RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1
+ xorReg rdx, rcx
+ .if SKEIN_ASM_UNROLL & 256
+ .irp _r0_,%( 8+(_Rbase_+3) % 5)
+ .irp _r1_,%(13+(_Rbase_+2) % 3)
+ leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx
+ .endr
+ .endr
+ .endif
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+1)
+
+ # round 4*_Rbase_ + 1
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0
+ xorReg rdx, rax
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
+ .endif
+ addReg rcx, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1
+ xorReg rbx, rcx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+2)
+ .if SKEIN_ASM_UNROLL & 256
+ .irp _r0_,%( 8+(_Rbase_+2) % 5)
+ .irp _r1_,%(13+(_Rbase_+1) % 3)
+ leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx
+ .endr
+ .endr
+ .endif
+ # round 4*_Rbase_ + 2
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0
+ addReg rcx, rdx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
+ .endif
+ xorReg rbx, rax
+ RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1
+ xorReg rdx, rcx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key
+ leaq 1(%r11,%rdi),%r11 #precompute key + tweak
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+3)
+ # round 4*_Rbase_ + 3
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0
+ addReg rcx, rbx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak
+ movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak
+ .endif
+ xorReg rdx, rax
+ RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1
+ xorReg rbx, rcx
+ Skein_Debug_Round 256,%(4*_Rbase_+4)
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ addReg r9 ,r13 #precompute key+tweak
+ .endif
+ #inject key schedule words
+_Rbase_ = _Rbase_+1
+ .if SKEIN_ASM_UNROLL & 256
+ addReg rax,r,%(8+((_Rbase_+0) % 5))
+ addReg rbx,rsi
+ addReg rcx,rdi
+ addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
+ .else
+ incq %rdi
+ addReg rax,r8
+ addReg rcx,r10
+ addReg rbx,r9
+ addReg rdx,r11
+ .endif
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 256) == 0
+ cmpq $2*(ROUNDS_256/8),%rdi
+ jb Skein_256_round_loop
+.endif # (SKEIN_ASM_UNROLL & 256) == 0
+ # End of rounds for one Skein_256 block: feedforward, store the new
+ # chaining values, then loop while blkCnt is non-zero.
+ movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
+
+ #----------------------------
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ movq $FIRST_MASK64 ,%r14
+ xorq Wcopy + 0+F_O (%rbp),%rax
+ xorq Wcopy + 8+F_O (%rbp),%rbx
+ xorq Wcopy +16+F_O (%rbp),%rcx
+ xorq Wcopy +24+F_O (%rbp),%rdx
+ andq TWEAK + 8 (%rdi),%r14 #r14 = T1 & FIRST_MASK64 (T1 for the next block)
+ movq %rax,X_VARS+ 0(%rdi) #store final result
+ movq %rbx,X_VARS+ 8(%rdi)
+ movq %rcx,X_VARS+16(%rdi)
+ movq %rdx,X_VARS+24(%rdi)
+
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+
+ # go back for more blocks, if needed
+ decq blkCnt+F_O(%rbp)
+ jnz Skein_256_block_loop
+ movq %r14,TWEAK + 8(%rdi) #write the masked T1 back to the context once, at exit
+ Reset_Stack
+ ret
+Skein_256_Process_Block_End:
+
+ # Debug helper for 256-bit blocks (debug builds only).  Reached by "call"
+ # from the Skein_Debug_Round macro, which has already pushed the caller's
+ # rdx.  Copies X0..X3 into X_stk[] and tail-jumps to the common handler.
+ .if _SKEIN_DEBUG
+Skein_Debug_Round_256: #here with rdx == round "number" from macro
+ pushq %rsi #save two regs for BLK_BITS-specific parms
+ pushq %rdi
+ #stack now: [rsp]=rdi, [rsp+8]=rsi, [rsp+16]=return addr, [rsp+24]=caller's rdx
+ movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi
+ movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it
+ movq %rbx,X_stk+ 8+F_O(%rbp) #(address via rbp since rsp has changed!)
+ movq %rcx,X_stk+16+F_O(%rbp)
+ movq %rdi,X_stk+24+F_O(%rbp) #X3 is the caller's original rdx value
+
+ movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
+ movq $256,%rdi #now <rdi,rsi,rdx> are set for the call
+ jmp Skein_Debug_Round_Common
+ .endif
+#
+# Optional helpers (only when _SKEIN_CODE_SIZE is set):
+#   Skein_256_Process_Block_CodeSize returns, in rax, the byte distance from
+#     Skein_256_Process_Block to Skein_256_Process_Block_End.
+#   Skein_256_Unroll_Cnt returns _UNROLL_CNT in rax, or 0 when _UNROLL_CNT
+#     equals ROUNDS_256/8 (i.e. the fully unrolled case).
+.if _SKEIN_CODE_SIZE
+C_label Skein_256_Process_Block_CodeSize
+ movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
+ ret
+#
+C_label Skein_256_Unroll_Cnt
+ .if _UNROLL_CNT <> ROUNDS_256/8
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif #_USE_ASM_ & 256
+#
+#=================================== Skein_512 =============================================
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7)
+#
+#################
+# MACRO: one round for 512-bit blocks
+#
+# R_512_OneRound: one round of four MIX operations on register pairs
+# (r<rn0>,r<rn1>) .. (r<rn6>,r<rn7>).  Each MIX is: add the second register
+# into the first, rotate the second (RotL64, constant selected by the round
+# number mod 8 and the pair index 0..3), then xor the first into the second.
+#   _Rn_     = round number
+#   op1..op4 = optional instructions interleaved after each MIX (used by the
+#              caller to pre-load key schedule words)
+.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
+#
+ addReg r\rn0, r\rn1
+ RotL64 r\rn1, 512,%((_Rn_) % 8),0
+ xorReg r\rn1, r\rn0
+ op1
+ addReg r\rn2, r\rn3
+ RotL64 r\rn3, 512,%((_Rn_) % 8),1
+ xorReg r\rn3, r\rn2
+ op2
+ addReg r\rn4, r\rn5
+ RotL64 r\rn5, 512,%((_Rn_) % 8),2
+ xorReg r\rn5, r\rn4
+ op3
+ addReg r\rn6, r\rn7
+ RotL64 r\rn7, 512,%((_Rn_) % 8),3
+ xorReg r\rn7, r\rn6
+ op4
+ Skein_Debug_Round 512,%(_Rn_+1),-4
+#
+.endm #R_512_OneRound
+#
+#################
+# MACRO: four rounds for 512-bit blocks, followed by one key injection
+#
+# _RR_ is the base round number (a multiple of 4).  The four OneRound calls
+# differ only in which registers are paired; the extra <...> operands
+# pre-load the key/tweak words needed by the key injection that follows.
+# Two variants are generated:
+#   * fully unrolled (bit 512 set in SKEIN_ASM_UNROLL): key schedule slots
+#     are selected at assembly time from the injection counter _II_
+#   * looping: slots are indexed at run time by %rdi, and the schedules are
+#     "rotated" by storing consumed words further up the stack
+# The unroll test is a bit test, so it uses bitwise "&" (a logical "&&"
+# would be true for any non-zero SKEIN_ASM_UNROLL).
+#
+.macro R_512_FourRounds _RR_ #RR = base round number (0 % 4)
+ .if (SKEIN_ASM_UNROLL & 512)
+ # here for fully unrolled case.
+ _II_ = ((_RR_)/4) + 1 #key injection counter
+ R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
+ R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
+ R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
+ R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
+ # inject the key schedule
+ addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
+ addReg r11, rax
+ addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
+ addReg r12, rbx
+ addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
+ addReg r13, rcx
+ addReg r14, rdx
+ addReg r15, rsi,,,(_II_)
+ .else
+ # here for looping case #"rotate" key/tweak schedule (move up on stack)
+ incq %rdi #bump key injection counter
+ R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
+ R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
+ R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
+ R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
+ # inject the key schedule
+ addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8
+ addReg r11, rax
+ addReg r12, rbx
+ addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9
+ addReg r13, rcx
+ addReg r14, rdx
+ addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10
+ addReg r15, rsi
+ addReg r15, rdi #inject the round number
+ .endif
+
+ #show the result of the key injection
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+.endm #R_512_FourRounds
+#
+#################
+# instantiated code
+#
+C_label Skein_512_Process_Block
+ Setup_Stack 512,ROUNDS_512/8
+ movq TWEAK+ 8(%rdi),%rbx
+ jmp Skein_512_block_loop
+ .p2align 4
+ # main hash loop for Skein_512
+Skein_512_block_loop:
+ # general register usage:
+ # RAX..RDX = temps for key schedule pre-loads
+ # R8 ..R15 = X0..X7
+ # RSP, RBP = stack/frame pointers
+ # RDI = round counter or context pointer
+ # RSI = temp
+ #
+ movq TWEAK + 0(%rdi),%rax
+ addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0
+ movq %rbx,%rcx
+ xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule
+ movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0]
+ movq %rax,ksTwk+ 0+F_O(%rbp)
+ movq $KW_PARITY,%rdx
+ movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block
+ movq %rbx,ksTwk+ 8+F_O(%rbp)
+ movq %rcx,ksTwk+16+F_O(%rbp)
+ .irp _Rn_,8,9,10,11,12,13,14,15
+ movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
+ xorq %r\_Rn_,%rdx #compute overall parity
+ movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
+ .endr #load state into %r8 ..%r15, compute parity
+ movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
+
+ addReg r13,rax #precompute key injection for tweak
+ addReg r14, rbx
+.if _SKEIN_DEBUG
+ movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+.endif
+ movq 0(%rsi),%rax #load input block
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ addReg r8 , rax #do initial key injection
+ addReg r9 , rbx
+ movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward
+ movq %rbx,Wcopy+ 8+F_O(%rbp)
+ addReg r10, rcx
+ addReg r11, rdx
+ movq %rcx,Wcopy+16+F_O(%rbp)
+ movq %rdx,Wcopy+24+F_O(%rbp)
+
+ movq 32(%rsi),%rax
+ movq 40(%rsi),%rbx
+ movq 48(%rsi),%rcx
+ movq 56(%rsi),%rdx
+ addReg r12, rax
+ addReg r13, rbx
+ addReg r14, rcx
+ addReg r15, rdx
+ movq %rax,Wcopy+32+F_O(%rbp)
+ movq %rbx,Wcopy+40+F_O(%rbp)
+ movq %rcx,Wcopy+48+F_O(%rbp)
+ movq %rdx,Wcopy+56+F_O(%rbp)
+
+.if _SKEIN_DEBUG
+ .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output
+ movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
+ .endr
+
+ Skein_Debug_Block 512 #debug dump
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+.endif
+ addq $8*WCNT,%rsi #skip the block
+ movq %rsi,blkPtr+F_O(%rbp) #update block pointer
+ #
+ #################
+ # now the key schedule is computed. Start the rounds
+ #
+# Select how many 8-round groups are emitted per pass of the round loop.
+# Fully unrolled: all ROUNDS_512/8 groups, no loop label.  Otherwise
+# SKEIN_UNROLL_512 groups per pass; it must divide ROUNDS_512/8 evenly.
+# (".error" is used so the message is printed, matching the 256/1024 code.)
+.if SKEIN_ASM_UNROLL & 512
+_UNROLL_CNT = ROUNDS_512/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ .if ((ROUNDS_512/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_512"
+ .endif
+ xorq %rdi,%rdi #rdi = round counter
+Skein_512_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 512) == 0
+ cmpq $2*(ROUNDS_512/8),%rdi
+ jb Skein_512_round_loop
+ movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
+.endif
+ # end of rounds
+ #################
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ .irp _Rn_,8,9,10,11,12,13,14,15
+ .if (_Rn_ == 8)
+ movq $FIRST_MASK64,%rbx
+ .endif
+ xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
+ movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result
+ .if (_Rn_ == 14)
+ andq TWEAK+ 8(%rdi),%rbx
+ .endif
+ .endr
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+ # go back for more blocks, if needed
+ decq blkCnt+F_O(%rbp)
+ jnz Skein_512_block_loop
+ movq %rbx,TWEAK + 8(%rdi)
+
+ Reset_Stack
+ ret
+Skein_512_Process_Block_End:
+#
+ # Debug helper for 512-bit blocks (debug builds only).  Reached by "call"
+ # from the Skein_Debug_Round macro.  Copies X0..X7 (r8..r15) into X_stk[]
+ # and tail-jumps to the common handler.
+ .if _SKEIN_DEBUG
+# call here with rdx = "round number"
+Skein_Debug_Round_512:
+ pushq %rsi #save two regs for BLK_BITS-specific parms
+ pushq %rdi
+ .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it
+ movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
+ .endr
+ movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
+ movq $512,%rdi #now <rdi,rsi,rdx> are set for the call
+ jmp Skein_Debug_Round_Common
+ .endif
+#
+# Optional helpers (only when _SKEIN_CODE_SIZE is set):
+#   Skein_512_Process_Block_CodeSize returns, in rax, the byte distance from
+#     Skein_512_Process_Block to Skein_512_Process_Block_End.
+#   Skein_512_Unroll_Cnt returns _UNROLL_CNT in rax, or 0 when _UNROLL_CNT
+#     equals ROUNDS_512/8 (i.e. the fully unrolled case).
+.if _SKEIN_CODE_SIZE
+C_label Skein_512_Process_Block_CodeSize
+ movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
+ ret
+#
+C_label Skein_512_Unroll_Cnt
+ .if _UNROLL_CNT <> (ROUNDS_512/8)
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#=================================== Skein1024 =============================================
+.if _USE_ASM_ & 1024
+#
+# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+# use details of permutation to make register assignments
+#
+# o1K_<reg> = index of the X[] state word held in <reg> for the 1024-bit
+# code.  Used with symbol pasting (o1K_\reg) to derive stack/context
+# offsets from a register name.  There is no entry for word 6: it shares
+# rcx with word 4, and whichever one is not in rcx lives in X_stk[].
+o1K_rdi = 0 #offsets in X[] associated with each register
+o1K_rsi = 1
+o1K_rbp = 2
+o1K_rax = 3
+o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate
+o1K_rbx = 5
+o1K_rdx = 7
+o1K_r8 = 8
+o1K_r9 = 9
+o1K_r10 = 10
+o1K_r11 = 11
+o1K_r12 = 12
+o1K_r13 = 13
+o1K_r14 = 14
+o1K_r15 = 15
+#
+rIdx_offs = tmpStk_1024 #stack slot holding the loop/injection index (looping build)
+#
+# r1024_Mix: one MIX operation for the 1024-bit code, plus inline key
+# injection on the last round of each group of four.
+#   w0,w1     = X[] word indices held in reg0/reg1
+#   reg0,reg1 = register names (without "%")
+#   _RN0_     = round number; _Rn1_ = pair index used to pick the rotation
+#   op1       = optional instruction emitted at the end
+# The MIX is: reg0 += reg1; reg1 = rotl(reg1); reg1 ^= reg0.
+# When (_RN0_ & 3) == 3 the key schedule is injected right here:
+#   * word 13 also gets a tweak word, word 14 the next tweak word, and
+#     word 15 the injection counter
+#   * in the looping build, word 0 is instead saved to X_stk[] so that rdi
+#     can be re-used as the schedule index (its injection is done later)
+# The tests are bit tests, so bitwise "&" is used: with a logical "&&" the
+# comparison "== 3" could never be true and no key would be injected.
+.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
+ addReg \reg0 , \reg1 #perform the MIX
+ RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_
+ xorReg \reg1 , \reg0
+.if ((_RN0_) & 3) == 3 #time to do key injection?
+ .if _SKEIN_DEBUG
+ movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round
+ movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection)
+ .endif
+_II_ = ((_RN0_)/4)+1 #injection count
+ .if SKEIN_ASM_UNROLL & 1024 #here to do fully unrolled key injection
+ addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
+ addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
+ .if w1 == 13 #tweak injection
+ addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
+ .elseif w0 == 14
+ addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
+ .elseif w1 == 15
+ addq $_II_, %\reg1 #(injection counter)
+ .endif
+ .else #here to do looping key injection
+ .if (w0 == 0)
+ movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index
+ movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi
+ .else
+ addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection
+ .endif
+ .if w1 == 13 #tweak injection
+ addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
+ .elseif w0 == 14
+ addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
+ .elseif w1 == 15
+ addReg \reg1,rdi,,,1 #(injection counter)
+ .endif
+ addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection
+ .endif
+.endif
+ # insert the op provided, if any
+ op1
+.endm
+#################
+# MACRO: four rounds for 1024-bit blocks
+#
+# r1024_FourRounds: four rounds (8 MIX operations each) for 1024-bit blocks.
+# _RR_ is the base round number (0 mod 4).  State words 4 and 6 share rcx:
+# each round saves one of them to X_stk[] and loads the other, via the op
+# argument of r1024_Mix.  Key injection for the fourth round is done inside
+# r1024_Mix; in the looping build this macro then rotates the key/tweak
+# schedules on the stack, bumps the index and finishes the X0 injection.
+# The unroll test is a bit test, so bitwise "&" is used.
+.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4)
+ # should be here with X4 set properly, X6 stored on stack
+_Rn_ = (_RR_) + 0
+ r1024_Mix 0, 1,rdi,rsi,_Rn_,0
+ r1024_Mix 2, 3,rbp,rax,_Rn_,1
+ r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
+ r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
+ r1024_Mix 10,11,r10,r11,_Rn_,5
+ r1024_Mix 12,13,r12,r13,_Rn_,6
+ r1024_Mix 6, 7,rcx,rdx,_Rn_,3
+ r1024_Mix 14,15,r14,r15,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (_RR_) + 1
+ r1024_Mix 0, 9,rdi,r9 ,_Rn_,0
+ r1024_Mix 2,13,rbp,r13,_Rn_,1
+ r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
+ r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
+ r1024_Mix 12, 3,r12,rax,_Rn_,5
+ r1024_Mix 14, 5,r14,rbx,_Rn_,6
+ r1024_Mix 4,15,rcx,r15,_Rn_,3
+ r1024_Mix 8, 1,r8 ,rsi,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (_RR_) + 2
+ r1024_Mix 0, 7,rdi,rdx,_Rn_,0
+ r1024_Mix 2, 5,rbp,rbx,_Rn_,1
+ r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
+ r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
+ r1024_Mix 14,13,r14,r13,_Rn_,5
+ r1024_Mix 8,11,r8 ,r11,_Rn_,6
+ r1024_Mix 6, 1,rcx,rsi,_Rn_,3
+ r1024_Mix 10, 9,r10,r9 ,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (_RR_) + 3
+ r1024_Mix 0,15,rdi,r15,_Rn_,0
+ r1024_Mix 2,11,rbp,r11,_Rn_,1
+ r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
+ r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
+ r1024_Mix 8, 5,r8 ,rbx,_Rn_,5
+ r1024_Mix 10, 3,r10,rax,_Rn_,6
+ r1024_Mix 4, 9,rcx,r9 ,_Rn_,3
+ r1024_Mix 12, 7,r12,rdx,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+
+ .if (SKEIN_ASM_UNROLL & 1024) == 0 #here with rdi == rIdx, X0 on stack
+ #"rotate" the key schedule on the stack
+i8 = o1K_r8
+i0 = o1K_rdi
+ movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack)
+ movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word
+ movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!)
+ movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word
+ movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack)
+ movq X_stk+8*i8(%rsp) ,%r8 #get the reg back
+ incq %rdi #bump the index
+ movq %rdi, rIdx_offs (%rsp) #save rdi again
+ movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back
+ addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection
+ .endif
+ #show the result of the key injection
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
+.endm #r1024_FourRounds
+#
+################
+# code
+#
+C_label Skein1024_Process_Block
+#
+ Setup_Stack 1024,ROUNDS_1024/8,WCNT
+ movq TWEAK+ 8(%rdi),%r9
+ jmp Skein1024_block_loop
+ # main hash loop for Skein1024
+ .p2align 4
+Skein1024_block_loop:
+ # general register usage:
+ # RSP = stack pointer
+ # RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
+ # R8 ..R15 = X8..X15 (state words)
+ # RBP = temp (used for X0 and X2)
+ #
+ # Per-block setup, part 1: reset the loop index (looping build only),
+ # update T0 by bitAdd and build the tweak schedule in r8/r9/r10.
+ # r9 (T1) is loaded before the loop and carried across blocks.
+ .if (SKEIN_ASM_UNROLL & 1024) == 0
+ xorq %rax,%rax #init loop index on the stack
+ movq %rax,rIdx_offs(%rsp)
+ .endif
+ movq TWEAK+ 0(%rdi),%r8
+ addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
+ movq %r9 ,%r10
+ xorq %r8 ,%r10 #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1)
+ movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
+ movq %r8 ,ksTwk+ 0+F_O(%rbp)
+ movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
+ movq %r10,ksTwk+16+F_O(%rbp)
+ .if _SKEIN_DEBUG
+ movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
+ .endif
+ movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
+ movq $KW_PARITY ,%rax #overall key schedule parity
+
+ # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
+ .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
+ movq X_VARS+8*_rN_(%rdi),%r14 #get state word
+ movq 8*_rN_(%rsi),%r15 #get msg word
+ xorq %r14,%rax #update key schedule overall parity
+ movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack
+ movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy
+ addq %r15,%r14 #do the initial key injection
+ movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack
+ .endr
+ # now process the rest, using the "real" registers
+ # (MUST do it in reverse order to inject tweaks r8/r9 first)
+ .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
+_oo_ = o1K_\_rr_ #offset assocated with the register
+ movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
+ movq 8*_oo_(%rsi),%rcx #get next input msg word
+ movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
+ xorq %\_rr_, %rax #accumulate key schedule parity
+ movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
+ addq %rcx,%\_rr_ #do the initial key injection
+ .if _oo_ == 13 #do the initial tweak injection
+ addReg _rr_,r8 # (only in words 13/14)
+ .elseif _oo_ == 14
+ addReg _rr_,r9
+ .endif
+ .endr
+ movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
+.if _SKEIN_DEBUG
+ Skein_Debug_Block 1024 #initial debug dump
+.endif
+ addq $8*WCNT,%rsi #bump the msg ptr
+ movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
+ # re-load words 0..4 from stack, enter the main loop
+ .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
+ movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
+ .endr
+.if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
+.endif
+ #
+ #################
+ # now the key schedule is computed. Start the rounds
+ #
+.if SKEIN_ASM_UNROLL & 1024
+_UNROLL_CNT = ROUNDS_1024/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ .if ((ROUNDS_1024/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_1024"
+ .endif
+Skein1024_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
+ r1024_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT
+#
+# Looping build only: repeat the round loop until the injection index kept
+# in the tmpStk_1024 stack slot reaches 2*(ROUNDS_1024/8).
+.if (SKEIN_ASM_UNROLL & 1024) == 0
+ cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done
+ jb Skein1024_round_loop
+.endif
+ # end of rounds
+ #################
+ #
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
+ movq ctxPtr(%rsp),%rdx
+
+ .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
+_oo_ = o1K_\_rr_
+ xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
+ movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
+ .if (_oo_ == 9)
+ movq $FIRST_MASK64 ,%r9
+ .endif
+ .if (_oo_ == 14)
+ andq TWEAK+ 8(%rdx),%r9
+ .endif
+ .endr
+ #
+ movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
+ movq X_stk +8*7(%rsp),%rbx
+ xorq Wcopy +8*6(%rsp),%rax
+ xorq Wcopy +8*7(%rsp),%rbx
+ movq %rax,X_VARS+8*6(%rdx)
+ decq blkCnt(%rsp) #set zero flag iff done
+ movq %rbx,X_VARS+8*7(%rdx)
+
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
+ # go back for more blocks, if needed
+ movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
+ lea FRAME_OFFS(%rsp),%rbp
+ jnz Skein1024_block_loop
+ movq %r9 ,TWEAK+ 8(%rdx)
+ Reset_Stack
+ ret
+#
+Skein1024_Process_Block_End:
+#
+# Debug helper for 1024-bit blocks (debug builds only).  Reached by "call"
+# from the Skein_Debug_Round macro, which has already pushed the caller's
+# rdx.  Writes the register-resident state words into X_stk[] (addressed
+# via rsp, with _SP_OFFS_ compensating for what has been pushed since),
+# then tail-jumps to the common handler.
+.if _SKEIN_DEBUG
+Skein_Debug_Round_1024:
+ # call here with rdx = "round number",
+_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
+ #
+ #save rest of X[] state on stack so debug routines can access it
+ .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
+ movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
+ .endr
+ # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
+ cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
+ jae save_x0
+ testq $3,%rdx #otherwise only if rdx != 0 mod 4
+ jz save_x0_not
+save_x0:
+ movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
+save_x0_not:
+ #figure out the x4/x6 swapping state and save the correct one!
+ cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
+ jae save_x4
+ testq $1,%rdx #and even ones have x4 in rcx as well
+ jz save_x4
+ movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) #odd round number: rcx holds x6
+ jmp debug_1024_go
+save_x4:
+ movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
+debug_1024_go:
+ #now all is saved in Xstk[] except for rdx
+ push %rsi #save two regs for BLK_BITS-specific parms
+ push %rdi
+_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
+
+ movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
+ movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
+
+ movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
+ movq $1024,%rdi #rdi = block size
+ jmp Skein_Debug_Round_Common
+.endif
+#
+# Optional helpers (only when _SKEIN_CODE_SIZE is set):
+#   Skein1024_Process_Block_CodeSize returns, in rax, the byte distance from
+#     Skein1024_Process_Block to Skein1024_Process_Block_End.
+#   Skein1024_Unroll_Cnt returns _UNROLL_CNT in rax, or 0 when _UNROLL_CNT
+#     equals ROUNDS_1024/8 (i.e. the fully unrolled case).
+.if _SKEIN_CODE_SIZE
+C_label Skein1024_Process_Block_CodeSize
+ movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
+ ret
+#
+C_label Skein1024_Unroll_Cnt
+ .if _UNROLL_CNT <> (ROUNDS_1024/8)
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ and 1024
+#
+.if _SKEIN_DEBUG
+#----------------------------------------------------------------
+#local debug routine to set up for calls to:
+# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
+# [ rdi rsi rdx rcx]
+#
+# here with %rdx = round number
+# %rsi = ctx_hdr_ptr
+# %rdi = block size (256/512/1024)
+# on stack: saved rdi, saved rsi, retAddr, saved rdx
+#
+# Saves twelve more registers, picks the X[] pointer to show (rcx), calls
+# Skein_Show_Round, restores everything and returns to the code that
+# invoked the Skein_Debug_Round macro.
+#
+Skein_Debug_Round_Common:
+_SP_OFFS_ = 32 #account for four words on stack already
+ .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
+ pushq %\_rr_
+_SP_OFFS_ = _SP_OFFS_+8
+ .endr
+ .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
+ .error "Debug_Round_Common: stack alignment"
+ .endif
+ # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
+ leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
+ cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
+ jnz _got_rcxA
+ leaq X_VARS(%rsi),%rcx #feedforward: show the chaining vars in the context instead
+_got_rcxA:
+ .if _USE_ASM_ & 1024
+ # special handling for 1024-bit case
+ # (for rounds right before with key injection:
+ # use xDebug_1024[] instead of X_stk[])
+ cmpq $SKEIN_RND_SPECIAL,%rdx
+ jae _got_rcxB #special "rounds": keep rcx as computed above
+ orq %rdx,%rdx
+ jz _got_rcxB #round number zero: keep rcx
+ test $3,%rdx
+ jne _got_rcxB #not a multiple of 4, i.e. not just before key injection
+ cmp $1024,%rdi #only 1024-bit(s) for now
+ jne _got_rcxB
+ leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
+_got_rcxB:
+ .endif
+ call Skein_Show_Round #call external debug handler
+
+ .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
+ popq %\_rr_
+_SP_OFFS_ = _SP_OFFS_-8
+ .endr
+ .if _SP_OFFS_ - 32 #must be back to the four words present on entry
+ .error "Debug_Round_Common: push/pop misalignment!"
+ .endif
+ popq %rdi #pushed by the block-size-specific Skein_Debug_Round_* routine
+ popq %rsi
+ ret
+.endif
+#----------------------------------------------------------------
+ .end
diff --git a/Additional_Implementations/skein_block_x86.asm b/Additional_Implementations/skein_block_x86.asm
new file mode 100644
index 000000000000..4679e991fe04
--- /dev/null
+++ b/Additional_Implementations/skein_block_x86.asm
@@ -0,0 +1,1180 @@
+;
+;----------------------------------------------------------------
+; 32-bit x86 assembler code for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+ .386p
+ .model flat
+ .code
+;
+_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits
+; _USE_ASM_ = bitmask (256/512/1024) of the block sizes that are assembled in this file
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM ;no selection given: assemble all three sizes
+_USE_ASM_ = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_ ;selection names at least one valid size: use it as given
+_USE_ASM_ = SKEIN_USE_ASM
+else ;selection contains none of the three bits: fall back to all three sizes
+_USE_ASM_ = _MASK_ALL_
+endif
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP
+_SKEIN_LOOP = 0 ;default is all fully unrolled
+else
+_SKEIN_LOOP = SKEIN_LOOP
+endif
+; the unroll counts (0 --> fully unrolled): one decimal digit of SKEIN_LOOP per block size
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10 ;hundreds digit
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10 ;tens digit
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10 ;ones digit
+; SKEIN_ASM_UNROLL = bitmask (256/512/1024) of the block sizes that are fully unrolled
+SKEIN_ASM_UNROLL = 0
+ irp _NN_,<256,512,1024>
+ if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_
+ endif
+ endm
+;;;;;;;;;;;;;;;;;
+; round counts: SKEIN_ROUNDS, when defined, holds one decimal digit d per block size giving 8*(((d+5) mod 10)+5) rounds (d=9: 72, d=0: 80, d=5: 40)
+ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+else
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5)
+endif
+irp _NN_,<256,512,1024> ;report the round count of every assembled block size at assembly time
+ if _USE_ASM_ and _NN_
+ irp _RR_,<%(ROUNDS_&_NN_)>
+ if _NN_ eq 1024
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ else
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ endif
+ endm
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+; _SKEIN_CODE_SIZE is defined when SKEIN_CODE_SIZE or SKEIN_PERF is defined; it enables the CodeSize and Unroll_Cnt helper procs below
+ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE equ (1)
+else
+ifdef SKEIN_PERF ;use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+; _SKEIN_DEBUG = 1 when SKEIN_DEBUG is defined (enables the debug dump macros and their extra stores), otherwise 0
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+else
+_SKEIN_DEBUG = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+; (each offset is the previous offset plus the previous field's size: 4, 4, then 16 bytes for the two 64-bit tweak words)
+HASH_BITS = 0 ;# bits of hash output (offset 0)
+BCNT = 4 + HASH_BITS ;number of bytes in BUFFER[] (offset 4)
+TWEAK = 4 + BCNT ;tweak values[0..1] (offset 8)
+X_VARS = 16 + TWEAK ;chaining vars (offset 24)
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+KW_PARITY_LO= 0A9FC1A22h ;overall parity of key schedule words (hi32/lo32): low 32 bits, initial value of the parity accumulator
+KW_PARITY_HI= 01BD11BDAh ;overall parity of key schedule words (hi32/lo32): high 32 bits, initial value of the parity accumulator
+FIRST_MASK = NOT (1 SHL 30) ;FIRST block flag bit: AND mask clearing bit 30 of the dword at TWEAK+12 (high half of T1)
+;
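+; rotation constants for Skein, named RC_[block bits]_[round number mod 8]_[mix index within the round]
+; (the RotL64 macro below builds these names from its BLK_SIZE, ROUND_NUM and MIX_NUM arguments)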
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+; MACRO: rotate the 64-bit value held in the 32-bit register pair rHi,rLo left by the assemble-time constant _RCNT_
+; Output: <rHi,rLo> <<< _RCNT_ (tmp is overwritten unless _RCNT_ is 0 or 32; a count of 0 emits no code)
+Rol64 macro rHi,rLo,tmp,_RCNT_
+ if _RCNT_ ;is there anything to do?
+ if _RCNT_ lt 32
+ mov tmp,rLo ;keep the old rLo for the second shift
+ shld rLo,rHi,_RCNT_ ;rLo shifted left, low bits filled from the top of rHi
+ shld rHi,tmp,_RCNT_ ;rHi shifted left, low bits filled from the top of the old rLo
+ elseif _RCNT_ gt 32 ;more than 32: do the equivalent right rotate by 64-_RCNT_
+ mov tmp,rLo ;keep the old rLo for the second shift
+ shrd rLo,rHi,((64-_RCNT_) AND 63)
+ shrd rHi,tmp,((64-_RCNT_) AND 63)
+ else
+ xchg rHi,rLo ;special case for _RCNT_ == 32
+ endif
+ endif
+endm
+; MACRO: rotate the 64-bit register pair left by the named rotation constant RC_[BLK_SIZE]_[ROUND_NUM]_[MIX_NUM]
+; Input: rHi,rLo
+; Output: <rHi,rLo> <<< (that constant AND 63), and tmp trashed;
+RotL64 macro rHi,rLo,tmp,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )
+ Rol64 rHi,rLo,tmp,_RCNT_
+endm
+;
+;----------------------------------------------------------------
+; MACRO: declare allocated space on the stack: localName gets the current running offset, which then advances by localSize bytes
+StackVar macro localName,localSize
+localName = _STK_OFFS_ ;offset of this variable from the base of the frame
+_STK_OFFS_ = _STK_OFFS_+(localSize) ;next variable starts right after it
+endm ;StackVar
+;
+;----------------------------------------------------------------
+; WCNT = state size in 64-bit words; KS_CNT sizes the ksRot area (the call sites pass ROUNDS/8)
+; MACRO: Configure stack frame, allocate local vars
+; On exit: all registers saved, esp --> locals, ebp = esp+FRAME_OFFS, edi --> context (loaded from the ctxPtr parameter)
+Setup_Stack macro WCNT,KS_CNT
+_STK_OFFS_ = 0 ;starting offset from esp
+ ;----- local variables ;<-- esp
+ StackVar X_stk ,8*(WCNT) ;local context vars
+ StackVar Wcopy ,8*(WCNT) ;copy of input block
+ StackVar ksTwk ,8*3 ;key schedule: tweak words
+ StackVar ksKey ,8*(WCNT)+8 ;key schedule: key words
+ if WCNT le 8
+FRAME_OFFS = _STK_OFFS_ ;<-- ebp
+ else
+FRAME_OFFS = _STK_OFFS_-8*4 ;<-- ebp
+ endif
+ if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0
+ StackVar ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+ endif
+LOCAL_SIZE = _STK_OFFS_ ;size of local vars
+ ;-----
+ StackVar savRegs,8*4 ;pushad data
+ StackVar retAddr,4 ;return address
+ ;----- caller parameters
+ StackVar ctxPtr ,4 ;context ptr
+ StackVar blkPtr ,4 ;pointer to block data
+ StackVar blkCnt ,4 ;number of full blocks to process
+ StackVar bitAdd ,4 ;bit count to add to tweak
+ ;----- caller's stack frame
+;
+; Notes on stack frame setup:
+; * the most frequently used variable is X_stk[], based at [esp+0]
+; * the next most used is the key schedule words
+; so ebp is "centered" there, allowing short offsets to the key/tweak
+; schedule even in 1024-bit Skein case
+; * the Wcopy variables are infrequently accessed, but they have long
+; offsets from both esp and ebp only in the 1024-bit case.
+; * all other local vars and calling parameters can be accessed
+; with short offsets, except in the 1024-bit case
+;
+ pushad ;save all regs
+ sub esp,LOCAL_SIZE ;make room for the locals
+ lea ebp,[esp+FRAME_OFFS] ;maximize use of short offsets
+ mov edi,[FP_+ctxPtr ] ;edi --> context
+;
+endm ;Setup_Stack
+; FP_ is the frame base seen through ebp: ebp-FRAME_OFFS equals the esp value left by Setup_Stack, so FP_+name addresses the stack variable name
+FP_ equ <ebp-FRAME_OFFS> ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+; MACRO: undo Setup_Stack (free the locals, restore the registers) and print the procedure's code size at assembly time; the ret is emitted by the user of the macro
+Reset_Stack macro procStart
+ add esp,LOCAL_SIZE ;get rid of locals (wipe??)
+ popad ;restore all regs
+
+ ;display code size in bytes to stdout
+ irp _BCNT_,<%($+1-procStart)> ;account for return opcode
+if _BCNT_ ge 10000 ;(align it all pretty)
+%out procStart code size = _BCNT_ bytes
+elseif _BCNT_ ge 1000
+%out procStart code size = _BCNT_ bytes
+else
+%out procStart code size = _BCNT_ bytes
+endif
+ endm ;irp _BCNT_
+
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+ extrn _Skein_Show_Block:near ;calls to C routines
+ extrn _Skein_Show_Round:near
+;
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+;
+Skein_Debug_Block macro BLK_BITS
+; (debug builds only) calls the C routine below with pointers to the chaining vars, input block, block copy, key schedule and tweak schedule; all registers are preserved
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+; const u08b_t *blkPtr, const u64b_t *wPtr,
+; const u64b_t *ksPtr,const u64b_t *tsPtr);
+; (the arguments are pushed last-to-first and removed again by this macro after the call)
+ pushad ;save all regs
+ lea eax,[FP_+ksTwk] ;eax --> tweak schedule
+ lea ebx,[FP_+ksKey] ;ebx --> key schedule
+ lea ecx,[esp+32+Wcopy] ;ecx --> copy of the input block (+32 skips the pushad data just pushed)
+ mov edx,[FP_+ctxPtr] ;ctx_hdr_ptr
+ lea edx,[edx+X_VARS] ;edx ==> ctx->X[]
+ push eax ;tsPtr
+ push ebx ;ksPtr
+ push ecx ;wPtr
+ push dword ptr [FP_+blkPtr] ;blkPtr
+ push edx ;ctx->Xptr
+ push dword ptr [FP_+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_BITS
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Block-4 ;strange linkage??
+ else
+ call _Skein_Show_Block
+ endif
+ add esp,7*4 ;discard parameter space on stack
+ popad ;restore regs
+endm ;Skein_Debug_Block
+
+; MACRO (debug builds only): dump the working state for round R through the C routine below; all registers are preserved
+Skein_Debug_Round macro BLK_SIZE,R,saveRegs
+; a non-blank saveRegs first writes eax..edx (the two state words held in registers) to the first 16 bytes of X_stk
+;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+; (the arguments are pushed last-to-first and removed again by this macro after the call)
+ ifnb <saveRegs>
+ mov [esp+X_stk+ 0],eax ;save internal vars for debug dump
+ mov [esp+X_stk+ 4],ebx
+ mov [esp+X_stk+ 8],ecx
+ mov [esp+X_stk+12],edx
+ endif
+ pushad ;save all regs
+ if R ne SKEIN_RND_FEED_FWD
+ lea eax,[esp+32+X_stk] ;normal case: dump the stack copy of X[] (+32 skips the pushad data)
+ else
+ mov eax,[FP_+ctxPtr] ;feedforward: dump the X[] words stored in the context
+ add eax,X_VARS
+ endif
+ push eax ;Xptr
+ if (SKEIN_ASM_UNROLL and BLK_SIZE) or (R ge SKEIN_RND_SPECIAL)
+ mov eax,R ;fully unrolled code or a special code: the value is known at assembly time
+ else
+ lea eax,[4*edi+1+(((R)-1) and 3)] ;compute round number using edi
+ endif
+ push eax ;round number
+ push dword ptr [FP_+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_SIZE
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Round-4 ;strange linkage??
+ else
+ call _Skein_Show_Round
+ endif
+ add esp,4*4 ;discard parameter space on stack
+ popad ;restore regs
+endm ;Skein_Debug_Round
+endif ;ifdef SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; Register use: ebx:eax = X[A], edx:ecx = X[C], esi = scratch for the rotate
+; MACRO: a mix step: X[A] += X[C]; X[C] = (X[C] rotated left) xor X[A]
+; ld_A,ld_C: X_stk word index to load (blank = already in the registers); st_A,st_C: index to store (blank st_A = no store, blank st_C = store to ld_C); RotNum0 = round number; RotNum1 = mix index; nonzero _debug_ = keep X[C] in registers only (unless debugging) and emit the debug hook
+MixStep macro BLK_SIZE,ld_A,ld_C,st_A,st_C,RotNum0,RotNum1,_debug_
+ ifnb <ld_A>
+ mov eax,[esp+X_stk+8*(ld_A)+0]
+ mov ebx,[esp+X_stk+8*(ld_A)+4]
+ endif
+ ifnb <ld_C>
+ mov ecx,[esp+X_stk+8*(ld_C)+0]
+ mov edx,[esp+X_stk+8*(ld_C)+4]
+ endif
+ add eax, ecx ;X[A] += X[C]
+ adc ebx, edx ;(high halves, with the carry from the low halves)
+ ifnb <st_A>
+ mov [esp+X_stk+8*(st_A)+0],eax
+ mov [esp+X_stk+8*(st_A)+4],ebx
+ endif
+__rNum0 = (RotNum0) AND 7 ;the rotation table repeats every 8 rounds
+ RotL64 ecx, edx, esi,%(BLK_SIZE),%(__rNum0),%(RotNum1) ;X[C] <<<= RC_<BLK_BITS,RotNum0,RotNum1>
+ xor ecx, eax ;X[C] ^= X[A]
+ xor edx, ebx
+ if _SKEIN_DEBUG or (0 eq (_debug_ + 0)) ;store X[C] unless it is meant to stay in registers for the next step
+ ifb <st_C>
+ mov [esp+X_stk+8*(ld_C)+0],ecx
+ mov [esp+X_stk+8*(ld_C)+4],edx
+ else
+ mov [esp+X_stk+8*(st_C)+0],ecx
+ mov [esp+X_stk+8*(st_C)+4],edx
+ endif
+ endif
+ if _SKEIN_DEBUG and (0 ne (_debug_ + 0)) ;debug builds: dump the state after the flagged step
+ Skein_Debug_Round BLK_SIZE,%(RotNum0+1)
+ endif
+endm ;MixStep
+;
+;;;;;;;;;;;;;;;;;
+; Adds one key schedule word (plus, when named, a tweak word and the injection number) to one 64-bit state word held in rHi:rLo
+; MACRO: key schedule injection
+; X_load/X_stor: X_stk index to load from / store to (blank = skip); fully unrolled code uses rndBase at assembly time, looping code indexes the schedules with edi at run time
+ks_Inject macro BLK_SIZE,X_load,X_stor,rLo,rHi,rndBase,keyIdx,twkIdx,ROUND_ADD
+ ;are rLo,rHi values already loaded? if not, load them now
+ ifnb <X_load>
+ mov rLo,[esp+X_stk +8*(X_load) ]
+ mov rHi,[esp+X_stk +8*(X_load)+4]
+ endif
+
+ ;inject the 64-bit key schedule value (and maybe the tweak as well)
+if SKEIN_ASM_UNROLL and BLK_SIZE
+_kOffs_ = ((rndBase)+(keyIdx)) mod ((BLK_SIZE/64)+1) ;key index wraps over the state words plus the parity word
+ add rLo,[FP_+ksKey+8*_kOffs_+ 0]
+ adc rHi,[FP_+ksKey+8*_kOffs_+ 4]
+ ifnb <twkIdx>
+_tOffs_ = ((rndBase)+(twkIdx)) mod 3 ;tweak index wraps over the three tweak schedule words
+ add rLo,[FP_+ksTwk+8*_tOffs_+ 0]
+ adc rHi,[FP_+ksTwk+8*_tOffs_+ 4]
+ endif
+ ifnb <ROUND_ADD>
+ add rLo,(ROUND_ADD)
+ adc rHi,0
+ endif
+else
+ add rLo,[FP_+ksKey+8*(keyIdx)+8*edi ]
+ adc rHi,[FP_+ksKey+8*(keyIdx)+8*edi+4]
+ ifnb <twkIdx>
+ add rLo,[FP_+ksTwk+8*(twkIdx)+8*edi ]
+ adc rHi,[FP_+ksTwk+8*(twkIdx)+8*edi+4]
+ endif
+ ifnb <ROUND_ADD>
+ add rLo,edi ;edi is the injection number here (the callers increment it once per four rounds)
+ adc rHi,0
+ endif
+endif
+
+ ;do we need to store updated rLo,rHi values? if so, do it now
+ ifnb <X_stor>
+ mov [esp+X_stk +8*(X_stor) ],rLo
+ mov [esp+X_stk +8*(X_stor)+4],rHi
+ endif
+endm ;ks_Inject
+;
+;----------------------------------------------------------------
+; MACRO: key schedule rotation
+; (looping code only) appends copies of key word [edi] and tweak word [edi] one full period further on, so later injections can index by edi without a modulo; rLo,rHi are scratch
+ks_Rotate macro rLo,rHi,WCNT
+ mov rLo,[FP_+ksKey+8*edi+ 0] ;"rotate" the key schedule in memory
+ mov rHi,[FP_+ksKey+8*edi+ 4]
+ mov [FP_+ksKey+8*edi+8*(WCNT+1)+ 0],rLo ;key period is WCNT+1 words (state words plus the parity word)
+ mov [FP_+ksKey+8*edi+8*(WCNT+1)+ 4],rHi
+ mov rLo,[FP_+ksTwk+8*edi+ 0]
+ mov rHi,[FP_+ksTwk+8*edi+ 4]
+ mov [FP_+ksTwk+8*edi+8*3+ 0],rLo ;tweak period is 3 words; ksTwk+8*3 is ksKey in the frame layout, so this reuses the slot of the key word copied above
+ mov [FP_+ksTwk+8*edi+8*3+ 4],rHi
+endm
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 256
+ public _Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; Emits rounds _RR_ and _RR_+1 for Skein-256 as two MixStep pairs; a blank ld_0 means X[0] is already in ebx:eax
+; MACRO: two rounds
+; (the second MixStep of each round passes _debug_=1: outside debug builds its X[C] result stays in edx:ecx for the next step)
+R_256_TwoRounds macro _RR_,ld_0
+ ; here with edx:ecx = X[1]
+ ;--------- round _RR_
+ MixStep 256,ld_0, ,0,1,((_RR_)+0),0
+ MixStep 256, 2,3,2,3,((_RR_)+0),1,1
+
+ ; here with edx:ecx = X[3]
+ ;--------- round _RR_ + 1
+ MixStep 256, 0, ,0,3,((_RR_)+1),0
+ MixStep 256, 2,1,2,1,((_RR_)+1),1,1
+
+ ; here with edx:ecx = X[1]
+endm ;R_256_TwoRounds
+;
+;;;;;;;;;;;;;;;;;
+; _Skein_256_Process_Block: processes blkCnt consecutive 32-byte blocks starting at blkPtr;
+; per block: T0 += bitcntAdd, ctx->X[] is replaced by the feedforward result, and the FIRST flag bit in T1 is cleared.
+; Arguments are read from the caller's stack; all registers are preserved (pushad/popad in Setup_Stack/Reset_Stack).
+_Skein_256_Process_Block proc near
+ WCNT = 4 ;WCNT=4 for Skein-256
+ Setup_Stack WCNT,(ROUNDS_256/8)
+
+ ; main hash loop for Skein_256
+Skein_256_block_loop:
+ mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0
+ mov ebx,[edi+TWEAK+ 4]
+ mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1
+ mov edx,[edi+TWEAK+12]
+
+ add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter
+ adc ebx, 0 ;(carry into the high half of T0)
+ mov [edi+TWEAK ],eax ;save updated tweak value T0
+ mov [edi+TWEAK+ 4],ebx
+
+ mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack
+ mov [FP_+ksTwk+ 4],ebx
+ xor eax,ecx ;ebx:eax = T0 ^ T1
+ xor ebx,edx
+ mov [FP_+ksTwk+ 8],ecx
+ mov [FP_+ksTwk+12],edx
+ mov [FP_+ksTwk+16],eax ;third tweak schedule word = T0 ^ T1
+ mov [FP_+ksTwk+20],ebx
+
+ mov eax,KW_PARITY_LO ;init parity accumulator
+ mov ebx,KW_PARITY_HI
+;
+_NN_ = 0
+ rept WCNT ;copy in the chaining vars
+ mov ecx,[edi+X_VARS+_NN_ ]
+ mov edx,[edi+X_VARS+_NN_+ 4]
+ xor eax,ecx ;compute overall parity along the way
+ xor ebx,edx
+ mov [FP_+ksKey +_NN_ ],ecx
+ mov [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+ endm
+;
+ mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array
+ mov [FP_+ksKey +_NN_+ 4],ebx
+
+ mov esi,[FP_+blkPtr ] ;esi --> input block
+;
+_NN_ = WCNT*8-16 ;work down from the end
+ rept WCNT/2 ;perform initial key injection
+ mov eax,[esi+_NN_ + 0] ;two input words per pass: ebx:eax and edx:ecx
+ mov ebx,[esi+_NN_ + 4]
+ mov ecx,[esi+_NN_ + 8]
+ mov edx,[esi+_NN_ +12]
+ mov [esp+_NN_+Wcopy + 0],eax ;keep a copy of the input words for the feedforward
+ mov [esp+_NN_+Wcopy + 4],ebx
+ mov [esp+_NN_+Wcopy + 8],ecx
+ mov [esp+_NN_+Wcopy +12],edx
+ add eax,[FP_+_NN_+ksKey + 0]
+ adc ebx,[FP_+_NN_+ksKey + 4]
+ add ecx,[FP_+_NN_+ksKey + 8]
+ adc edx,[FP_+_NN_+ksKey +12]
+ if _NN_ eq (WCNT*8-16) ;inject the tweak words
+ add eax,[FP_+ ksTwk + 8]; (at the appropriate points)
+ adc ebx,[FP_+ ksTwk +12]
+ elseif _NN_ eq (WCNT*8-32)
+ add ecx,[FP_+ ksTwk + 0]
+ adc edx,[FP_+ ksTwk + 4]
+ endif
+ if _NN_ or _SKEIN_DEBUG ;the last pass (X[0],X[1]) stays in registers unless debugging
+ mov [esp+_NN_+X_stk + 0],eax
+ mov [esp+_NN_+X_stk + 4],ebx
+ mov [esp+_NN_+X_stk + 8],ecx
+ mov [esp+_NN_+X_stk +12],edx
+ endif
+_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX!
+ endm
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block WCNT*64
+ Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+ add esi, WCNT*8 ;skip the block
+ mov [FP_+blkPtr ],esi ;update block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT = ROUNDS_256/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_256 ;unroll count
+ if ((ROUNDS_256/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_256"
+ endif
+ xor edi,edi ;edi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ;each pass = four rounds followed by one key injection
+ ; here with X[0], X[1] already loaded into eax..edx
+ R_256_TwoRounds %(4*_Rbase_+00),
+ R_256_TwoRounds %(4*_Rbase_+02),0
+
+ ;inject key schedule
+ if _UNROLL_CNT ne (ROUNDS_256/8)
+ ks_Rotate eax,ebx,WCNT
+ inc edi ;edi = round number
+ endif
+_Rbase_ = _Rbase_+1
+ ks_Inject 256,3,3,eax,ebx,_Rbase_,3, ,_Rbase_
+ ks_Inject 256,2,2,eax,ebx,_Rbase_,2,1
+ ks_Inject 256, , ,ecx,edx,_Rbase_,1,0
+ ks_Inject 256,0, ,eax,ebx,_Rbase_,0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs
+ endif
+endm ;rept _UNROLL_CNT*2
+;
+ if _UNROLL_CNT ne (ROUNDS_256/8)
+ cmp edi,2*(ROUNDS_256/8) ;one injection per four rounds: loop until ROUNDS_256/4 are done
+ jb Skein_256_round_loop
+ mov edi,[FP_+ctxPtr ] ;restore edi --> context
+ endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+_NN_ = 0
+ rept WCNT/2
+ if _NN_ ;eax..edx already loaded the first time
+ mov eax,[esp+X_stk + _NN_ + 0]
+ mov ebx,[esp+X_stk + _NN_ + 4]
+ mov ecx,[esp+X_stk + _NN_ + 8]
+ mov edx,[esp+X_stk + _NN_ +12]
+ endif
+ if _NN_ eq 0
+ and dword ptr [edi +TWEAK +12],FIRST_MASK ;clear the FIRST block flag in the stored T1
+ endif
+ xor eax,[esp+Wcopy + _NN_ + 0]
+ xor ebx,[esp+Wcopy + _NN_ + 4]
+ xor ecx,[esp+Wcopy + _NN_ + 8]
+ xor edx,[esp+Wcopy + _NN_ +12]
+ mov [edi+X_VARS+ _NN_ + 0],eax
+ mov [edi+X_VARS+ _NN_ + 4],ebx
+ mov [edi+X_VARS+ _NN_ + 8],ecx
+ mov [edi+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+ endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+endif
+ ; go back for more blocks, if needed
+ dec dword ptr [FP_+blkCnt]
+ jnz Skein_256_block_loop
+
+ Reset_Stack _Skein_256_Process_Block
+ ret
+_Skein_256_Process_Block endp
+; optional helpers: return in eax the byte distance from the start of _Skein_256_Process_Block to this procedure (its code size)
+ifdef _SKEIN_CODE_SIZE
+ public _Skein_256_Process_Block_CodeSize
+_Skein_256_Process_Block_CodeSize proc
+ mov eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block
+ ret
+_Skein_256_Process_Block_CodeSize endp
+; ... and the unroll count used for the round loop, or 0 when the rounds are fully unrolled
+ public _Skein_256_Unroll_Cnt
+_Skein_256_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_256/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein_256_Unroll_Cnt endp
+endif
+endif ;_USE_ASM_ and 256
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 512
+ public _Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds
+; Emits rounds _RR_ .. _RR_+3 for Skein-512 as MixStep sequences; a blank ld_0 means X[0] is already in ebx:eax; the last MixStep of each round passes _debug_=1, so outside debug builds its X[C] result stays in edx:ecx
+R_512_FourRounds macro _RR_,ld_0
+ ; here with edx:ecx = X[1]
+ ;--------- round _RR_
+ ; R512(0,1,2,3,4,5,6,7,R_0, 1);
+ MixStep 512, ld_0, ,0,1,((_RR_)+0),0
+ MixStep 512, 2,3,2,3,((_RR_)+0),1
+ MixStep 512, 4,5,4,5,((_RR_)+0),2
+ MixStep 512, 6,7,6, ,((_RR_)+0),3,1
+
+ ; here with edx:ecx = X[7]
+ ; R512(2,1,4,7,6,5,0,3,R_1, 2);
+ MixStep 512, 4, ,4,7,((_RR_)+1),1
+ MixStep 512, 6,5,6,5,((_RR_)+1),2
+ MixStep 512, 0,3,0,3,((_RR_)+1),3
+ MixStep 512, 2,1,2, ,((_RR_)+1),0,1
+
+ ; here with edx:ecx = X[1]
+ ; R512(4,1,6,3,0,5,2,7,R_2, 3);
+ MixStep 512, 4, ,4,1,((_RR_)+2),0
+ MixStep 512, 6,3,6,3,((_RR_)+2),1
+ MixStep 512, 0,5,0,5,((_RR_)+2),2
+ MixStep 512, 2,7,2, ,((_RR_)+2),3,1
+
+ ; here with edx:ecx = X[7]
+ ; R512(6,1,0,7,2,5,4,3,R_3, 4);
+ MixStep 512, 0, ,0,7,((_RR_)+3),1
+ MixStep 512, 2,5,2,5,((_RR_)+3),2
+ MixStep 512, 4,3,4,3,((_RR_)+3),3
+ MixStep 512, 6,1,6, ,((_RR_)+3),0,1
+
+endm ;R_512_FourRounds
+; _Skein_512_Process_Block: processes blkCnt consecutive 64-byte blocks starting at blkPtr;
+;;;;;;;;;;;;;;;;;
+; per block: T0 += bitcntAdd, ctx->X[] is replaced by the feedforward result, and the FIRST flag bit in T1 is cleared.
+; Arguments are read from the caller's stack; all registers are preserved (pushad/popad in Setup_Stack/Reset_Stack).
+_Skein_512_Process_Block proc near
+ WCNT = 8 ;WCNT=8 for Skein-512
+ Setup_Stack WCNT,(ROUNDS_512/8)
+
+ ; main hash loop for Skein_512
+Skein_512_block_loop:
+ mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0
+ mov ebx,[edi+TWEAK+ 4]
+ mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1
+ mov edx,[edi+TWEAK+12]
+
+ add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter
+ adc ebx, 0 ;(carry into the high half of T0)
+ mov [edi+TWEAK ],eax ;save updated tweak value T0
+ mov [edi+TWEAK+ 4],ebx
+
+ mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack
+ mov [FP_+ksTwk+ 4],ebx
+ xor eax,ecx ;ebx:eax = T0 ^ T1
+ xor ebx,edx
+ mov [FP_+ksTwk+ 8],ecx
+ mov [FP_+ksTwk+12],edx
+ mov [FP_+ksTwk+16],eax ;third tweak schedule word = T0 ^ T1
+ mov [FP_+ksTwk+20],ebx
+
+ mov eax,KW_PARITY_LO ;init parity accumulator
+ mov ebx,KW_PARITY_HI
+;
+_NN_ = 0
+ rept WCNT ;copy in the chaining vars
+ mov ecx,[edi+X_VARS+_NN_ ]
+ mov edx,[edi+X_VARS+_NN_+ 4]
+ xor eax,ecx ;compute overall parity along the way
+ xor ebx,edx
+ mov [FP_+ksKey +_NN_ ],ecx
+ mov [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+ endm
+;
+ mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array
+ mov [FP_+ksKey +_NN_+ 4],ebx
+
+ mov esi,[FP_+blkPtr ] ;esi --> input block
+;
+_NN_ = WCNT*8-16 ;work down from the end
+ rept WCNT/2 ;perform initial key injection
+ mov eax,[esi+_NN_ + 0] ;two input words per pass: ebx:eax and edx:ecx
+ mov ebx,[esi+_NN_ + 4]
+ mov ecx,[esi+_NN_ + 8]
+ mov edx,[esi+_NN_ +12]
+ mov [esp+_NN_+Wcopy + 0],eax ;keep a copy of the input words for the feedforward
+ mov [esp+_NN_+Wcopy + 4],ebx
+ mov [esp+_NN_+Wcopy + 8],ecx
+ mov [esp+_NN_+Wcopy +12],edx
+ add eax,[FP_+_NN_+ksKey + 0]
+ adc ebx,[FP_+_NN_+ksKey + 4]
+ add ecx,[FP_+_NN_+ksKey + 8]
+ adc edx,[FP_+_NN_+ksKey +12]
+ if _NN_ eq (WCNT*8-16) ;inject the tweak words
+ add eax,[FP_+ ksTwk + 8]; (at the appropriate points)
+ adc ebx,[FP_+ ksTwk +12]
+ elseif _NN_ eq (WCNT*8-32)
+ add ecx,[FP_+ ksTwk + 0]
+ adc edx,[FP_+ ksTwk + 4]
+ endif
+ if _NN_ or _SKEIN_DEBUG ;the last pass (X[0],X[1]) stays in registers unless debugging
+ mov [esp+_NN_+X_stk + 0],eax
+ mov [esp+_NN_+X_stk + 4],ebx
+ mov [esp+_NN_+X_stk + 8],ecx
+ mov [esp+_NN_+X_stk +12],edx
+ endif
+_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX!
+ endm
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block WCNT*64
+ Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+ add esi, WCNT*8 ;skip the block
+ mov [FP_+blkPtr ],esi ;update block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT = ROUNDS_512/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ if ((ROUNDS_512/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_512"
+ endif
+ xor edi,edi ;edi = round counter
+Skein_512_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ;each pass = four rounds followed by one key injection
+ ; here with X[0], X[1] already loaded into eax..edx
+ R_512_FourRounds %(4*_Rbase_+00),
+
+ ;inject odd key schedule words
+ if _UNROLL_CNT ne (ROUNDS_512/8)
+ ks_Rotate eax,ebx,WCNT
+ inc edi ;edi = round number
+ endif
+_Rbase_ = _Rbase_+1
+ ks_Inject 512,7,7,eax,ebx,_Rbase_,7, ,_Rbase_
+ ks_Inject 512,6,6,eax,ebx,_Rbase_,6,1
+ ks_Inject 512,5,5,eax,ebx,_Rbase_,5,0
+ ks_Inject 512,4,4,eax,ebx,_Rbase_,4
+ ks_Inject 512,3,3,eax,ebx,_Rbase_,3
+ ks_Inject 512,2,2,eax,ebx,_Rbase_,2
+ ks_Inject 512, , ,ecx,edx,_Rbase_,1
+ ks_Inject 512,0, ,eax,ebx,_Rbase_,0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT ,saveRegs
+ endif
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+ cmp edi,2*(ROUNDS_512/8) ;one injection per four rounds: loop until ROUNDS_512/4 are done
+ jb Skein_512_round_loop
+ mov edi,[FP_+ctxPtr ] ;restore edi --> context
+endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+_NN_ = 0
+ rept WCNT/2
+ if _NN_ ;eax..edx already loaded the first time
+ mov eax,[esp+X_stk + _NN_ + 0]
+ mov ebx,[esp+X_stk + _NN_ + 4]
+ mov ecx,[esp+X_stk + _NN_ + 8]
+ mov edx,[esp+X_stk + _NN_ +12]
+ endif
+ if _NN_ eq 0
+ and dword ptr [edi + TWEAK+12],FIRST_MASK ;clear the FIRST block flag in the stored T1
+ endif
+ xor eax,[esp+Wcopy + _NN_ + 0]
+ xor ebx,[esp+Wcopy + _NN_ + 4]
+ xor ecx,[esp+Wcopy + _NN_ + 8]
+ xor edx,[esp+Wcopy + _NN_ +12]
+ mov [edi+X_VARS+ _NN_ + 0],eax
+ mov [edi+X_VARS+ _NN_ + 4],ebx
+ mov [edi+X_VARS+ _NN_ + 8],ecx
+ mov [edi+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+ endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+endif
+ ; go back for more blocks, if needed
+ dec dword ptr [FP_+blkCnt]
+ jnz Skein_512_block_loop
+
+ Reset_Stack _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block endp
+; optional helpers: return in eax the byte distance from the start of _Skein_512_Process_Block to this procedure (its code size)
+ifdef _SKEIN_CODE_SIZE
+ public _Skein_512_Process_Block_CodeSize
+_Skein_512_Process_Block_CodeSize proc
+ mov eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block_CodeSize endp
+; ... and the unroll count used for the round loop, or 0 when the rounds are fully unrolled
+ public _Skein_512_Unroll_Cnt
+_Skein_512_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_512/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 1024
+ public _Skein1024_Process_Block
+;
+; void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds
+; Emits rounds _RR_ .. _RR_+3 for Skein-1024 as MixStep sequences; a blank ld_0 means X[0] is already in ebx:eax; the last MixStep of each round passes _debug_=1, so outside debug builds its X[C] result stays in edx:ecx
+R_1024_FourRounds macro _RR_,ld_0
+ ; here with edx:ecx = X[1]
+
+ ;--------- round _RR_
+ MixStep 1024, ld_0, , 0, 1,((_RR_)+0),0
+ MixStep 1024, 2, 3, 2, 3,((_RR_)+0),1
+ MixStep 1024, 4, 5, 4, 5,((_RR_)+0),2
+ MixStep 1024, 6, 7, 6, 7,((_RR_)+0),3
+ MixStep 1024, 8, 9, 8, 9,((_RR_)+0),4
+ MixStep 1024, 10,11,10,11,((_RR_)+0),5
+ MixStep 1024, 12,13,12,13,((_RR_)+0),6
+ MixStep 1024, 14,15,14, ,((_RR_)+0),7,1
+ ; here with edx:ecx = X[15]
+
+ ;--------- round _RR_+1
+ MixStep 1024, 4, , 4,15,((_RR_)+1),3
+ MixStep 1024, 0, 9, 0, 9,((_RR_)+1),0
+ MixStep 1024, 2,13, 2,13,((_RR_)+1),1
+ MixStep 1024, 6,11, 6,11,((_RR_)+1),2
+ MixStep 1024, 10, 7,10, 7,((_RR_)+1),4
+ MixStep 1024, 12, 3,12, 3,((_RR_)+1),5
+ MixStep 1024, 14, 5,14, 5,((_RR_)+1),6
+ MixStep 1024, 8, 1, 8, ,((_RR_)+1),7,1
+ ; here with edx:ecx = X[1]
+
+ ;--------- round _RR_+2
+ MixStep 1024, 6, , 6, 1,((_RR_)+2),3
+ MixStep 1024, 0, 7, 0, 7,((_RR_)+2),0
+ MixStep 1024, 2, 5, 2, 5,((_RR_)+2),1
+ MixStep 1024, 4, 3, 4, 3,((_RR_)+2),2
+ MixStep 1024, 12,15,12,15,((_RR_)+2),4
+ MixStep 1024, 14,13,14,13,((_RR_)+2),5
+ MixStep 1024, 8,11, 8,11,((_RR_)+2),6
+ MixStep 1024, 10, 9,10, ,((_RR_)+2),7,1
+ ; here with edx:ecx = X[9]
+
+ ;--------- round _RR_+3
+ MixStep 1024, 4, , 4, 9,((_RR_)+3),3
+ MixStep 1024, 0,15, 0,15,((_RR_)+3),0
+ MixStep 1024, 2,11, 2,11,((_RR_)+3),1
+ MixStep 1024, 6,13, 6,13,((_RR_)+3),2
+ MixStep 1024, 8, 5, 8, 5,((_RR_)+3),5
+ MixStep 1024, 10, 3,10, 3,((_RR_)+3),6
+ MixStep 1024, 12, 7,12, 7,((_RR_)+3),7
+ MixStep 1024, 14, 1,14, ,((_RR_)+3),4,1
+
+ ; here with edx:ecx = X[1]
+endm ;R_1024_FourRounds
+; _Skein1024_Process_Block: processes blkCnt consecutive 128-byte blocks starting at blkPtr;
+;;;;;;;;;;;;;;;;;
+; per block: T0 += bitcntAdd, ctx->X[] is replaced by the feedforward result, and the FIRST flag bit in T1 is cleared.
+; Arguments are read from the caller's stack; all registers are preserved (pushad/popad in Setup_Stack/Reset_Stack).
+_Skein1024_Process_Block proc near
+; edi is reused below as a biased context pointer, as the Wcopy pointer and (looping code) as the injection counter
+ WCNT = 16 ;WCNT=16 for Skein-1024
+ Setup_Stack WCNT,(ROUNDS_1024/8)
+
+ ; main hash loop for Skein1024
+Skein1024_block_loop:
+ mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0
+ mov ebx,[edi+TWEAK+ 4]
+ mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1
+ mov edx,[edi+TWEAK+12]
+
+ add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter
+ adc ebx, 0 ;(carry into the high half of T0)
+ mov [edi+TWEAK ],eax ;save updated tweak value T0
+ mov [edi+TWEAK+ 4],ebx
+
+ mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack
+ mov [FP_+ksTwk+ 4],ebx
+ xor eax,ecx ;ebx:eax = T0 ^ T1
+ xor ebx,edx
+ mov [FP_+ksTwk+ 8],ecx
+ mov [FP_+ksTwk+12],edx
+ mov [FP_+ksTwk+16],eax ;third tweak schedule word = T0 ^ T1
+ mov [FP_+ksTwk+20],ebx
+
+ mov eax,KW_PARITY_LO ;init parity accumulator
+ mov ebx,KW_PARITY_HI
+EDI_BIAS equ 70h ;bias the edi offsets to make them short!
+ add edi, EDI_BIAS
+CT_ equ <edi-EDI_BIAS>
+;
+_NN_ = 0
+ rept WCNT ;copy in the chaining vars
+ mov ecx,[CT_+X_VARS+_NN_ ]
+ mov edx,[CT_+X_VARS+_NN_+ 4]
+ xor eax,ecx ;compute overall parity along the way
+ xor ebx,edx
+ mov [FP_+ksKey +_NN_ ],ecx
+ mov [FP_+ksKey +_NN_+ 4],edx
+_NN_ = _NN_+8
+ endm
+;
+ mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array
+ mov [FP_+ksKey +_NN_+ 4],ebx
+
+ mov esi,[FP_+blkPtr ] ;esi --> input block
+ lea edi,[esp+Wcopy] ;edi --> Wcopy for the stores below (the context pointer is reloaded before the feedforward)
+;
+_NN_ = WCNT*8-16 ;work down from the end
+ rept WCNT/2 ;perform initial key injection
+ mov eax,[esi+_NN_ + 0] ;two input words per pass: ebx:eax and edx:ecx
+ mov ebx,[esi+_NN_ + 4]
+ mov ecx,[esi+_NN_ + 8]
+ mov edx,[esi+_NN_ +12]
+ mov [edi+_NN_+ + 0],eax ;keep a copy of the input words (in Wcopy) for the feedforward
+ mov [edi+_NN_+ + 4],ebx
+ mov [edi+_NN_+ + 8],ecx
+ mov [edi+_NN_+ +12],edx
+ add eax,[FP_+_NN_+ksKey + 0]
+ adc ebx,[FP_+_NN_+ksKey + 4]
+ add ecx,[FP_+_NN_+ksKey + 8]
+ adc edx,[FP_+_NN_+ksKey +12]
+ if _NN_ eq (WCNT*8-16) ;inject the tweak words
+ add eax,[FP_+ ksTwk + 8]; (at the appropriate points)
+ adc ebx,[FP_+ ksTwk +12]
+ elseif _NN_ eq (WCNT*8-32)
+ add ecx,[FP_+ ksTwk + 0]
+ adc edx,[FP_+ ksTwk + 4]
+ endif
+ if _NN_ or _SKEIN_DEBUG ;the last pass (X[0],X[1]) stays in registers unless debugging
+ mov [esp+_NN_+X_stk + 0],eax
+ mov [esp+_NN_+X_stk + 4],ebx
+ mov [esp+_NN_+X_stk + 8],ecx
+ mov [esp+_NN_+X_stk +12],edx
+ endif
+_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX!
+ endm
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block WCNT*64
+ Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL
+endif
+ sub esi,-WCNT*8 ;skip the block (short immediate)
+ mov [FP_+blkPtr ],esi ;update block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT = ROUNDS_1024/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_1024"
+ endif
+ xor edi,edi ;edi = round counter
+Skein_1024_round_loop:
+endif
+
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ;each pass = four rounds followed by one key injection
+ ; here with X[0], X[1] already loaded into eax..edx
+ R_1024_FourRounds %(4*_Rbase_+00),
+
+ ;inject odd key schedule words
+ ;(looping code first extends the schedules with ks_Rotate and advances the injection counter in edi)
+ if _UNROLL_CNT ne (ROUNDS_1024/8)
+ ks_Rotate eax,ebx,WCNT
+ inc edi ;edi = round number
+ endif
+_Rbase_ = _Rbase_+1
+ ks_Inject 1024,15,15,eax,ebx,_Rbase_,15, ,_Rbase_
+ ks_Inject 1024,14,14,eax,ebx,_Rbase_,14,1
+ ks_Inject 1024,13,13,eax,ebx,_Rbase_,13,0
+ irp _w,<12,11,10,9,8,7,6,5,4,3,2>
+ ks_Inject 1024,_w,_w,eax,ebx,_Rbase_,_w
+ endm
+ ks_Inject 1024, , ,ecx,edx,_Rbase_,1
+ ks_Inject 1024, 0, ,eax,ebx,_Rbase_,0
+
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs
+ endif
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+ cmp edi,2*(ROUNDS_1024/8) ;one injection per four rounds: loop until ROUNDS_1024/4 are done
+ jb Skein_1024_round_loop
+endif
+ mov edi,[FP_+ctxPtr ] ;restore edi --> context
+ add edi,EDI_BIAS ;and bias it for short offsets below
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ lea esi,[esp+Wcopy] ;use short offsets below
+_NN_ = 0
+ rept WCNT/2
+ if _NN_ ;eax..edx already loaded the first time
+ mov eax,[esp+X_stk + _NN_ + 0]
+ mov ebx,[esp+X_stk + _NN_ + 4]
+ mov ecx,[esp+X_stk + _NN_ + 8]
+ mov edx,[esp+X_stk + _NN_ +12]
+ endif
+ if _NN_ eq 0
+ and dword ptr [CT_ + TWEAK+12],FIRST_MASK ;clear the FIRST block flag in the stored T1
+ endif
+ xor eax,[esi + _NN_ + 0]
+ xor ebx,[esi + _NN_ + 4]
+ xor ecx,[esi + _NN_ + 8]
+ xor edx,[esi + _NN_ +12]
+ mov [CT_+X_VARS+ _NN_ + 0],eax
+ mov [CT_+X_VARS+ _NN_ + 4],ebx
+ mov [CT_+X_VARS+ _NN_ + 8],ecx
+ mov [CT_+X_VARS+ _NN_ +12],edx
+_NN_ = _NN_+16
+ endm
+ sub edi,EDI_BIAS ;undo the bias for return
+
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD
+endif
+ ; go back for more blocks, if needed
+ dec dword ptr [FP_+blkCnt]
+ jnz Skein1024_block_loop
+
+ Reset_Stack _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block endp
+; optional helpers: return in eax the byte distance from the start of _Skein1024_Process_Block to this procedure (its code size)
+ifdef _SKEIN_CODE_SIZE
+ public _Skein1024_Process_Block_CodeSize
+_Skein1024_Process_Block_CodeSize proc
+ mov eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block_CodeSize endp
+; ... and the unroll count used for the round loop, or 0 when the rounds are fully unrolled
+ public _Skein1024_Unroll_Cnt
+_Skein1024_Unroll_Cnt proc
+ if _UNROLL_CNT ne ROUNDS_1024/8
+ mov eax,_UNROLL_CNT
+ else
+ xor eax,eax
+ endif
+ ret
+_Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;----------------------------------------------------------------
+ end
diff --git a/Additional_Implementations/skein_block_xmm32.asm b/Additional_Implementations/skein_block_xmm32.asm
new file mode 100644
index 000000000000..96ef121cd49a
--- /dev/null
+++ b/Additional_Implementations/skein_block_xmm32.asm
@@ -0,0 +1,1167 @@
+;
+;----------------------------------------------------------------
+; 32-bit x86 assembler code for Skein block functions using XMM registers
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+ .386p
+ .model flat
+ .code
+ .xmm ;enable XMM instructions
+;
+_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_ = SKEIN_USE_ASM
+else
+_USE_ASM_ = _MASK_ALL_
+endif
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_LOOP ;SKEIN_LOOP packs three decimal digits, one per block size
+_SKEIN_LOOP = 0 ;default is all fully unrolled
+else
+_SKEIN_LOOP = SKEIN_LOOP
+endif
+;--------------
+; the unroll counts (0 --> fully unrolled): hundreds digit = 256, tens = 512, ones = 1024
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10
+; SKEIN_ASM_UNROLL = sum of the block sizes whose unroll digit is 0 (tested later with "and")
+SKEIN_ASM_UNROLL = 0
+ irp _NN_,<256,512,1024>
+ if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_
+ endif
+ endm
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_ROUNDS ;default round counts
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+else ;SKEIN_ROUNDS digits (256,512,1024): digit d gives 8*(((d+5) mod 10)+5) rounds, e.g. 0->80, 5->40, 9->72
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5)
+endif
+irp _NN_,<256,512,1024> ;report the round count of each block size selected in _USE_ASM_
+ if _USE_ASM_ and _NN_
+ irp _RR_,<%(ROUNDS_&_NN_)>
+ if _NN_ eq 1024
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ else
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ endif
+ endm
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE equ (1)
+else
+ifdef SKEIN_PERF ;use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+else
+_SKEIN_DEBUG = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+;
+HASH_BITS = 0 ;# bits of hash output
+BCNT = 4 + HASH_BITS ;number of bytes in BUFFER[]
+TWEAK = 4 + BCNT ;tweak values[0..1]
+X_VARS = 16 + TWEAK ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+KW_PARITY_LO= 0A9FC1A22h ;overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI= 01BD11BDAh
+FIRST_MASK8 = NOT (1 SHL 6) ;FIRST block flag bit
+;
+; rotation constants for Skein
+;
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+mov64 macro dst,src ;64-bit move wrapper: expands to a single movq
+ movq dst,src ;operands are passed through unchanged
+endm ;mov64
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack
+StackVar macro varName,byteCnt ;bind varName to the running offset, then reserve byteCnt bytes
+varName = _STK_OFFS_ ;offset assigned to this variable
+_STK_OFFS_ = _STK_OFFS_+(byteCnt) ;advance the running offset
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars
+;
+Setup_Stack macro WCNT,RND_CNT ;WCNT = 64-bit words per block, RND_CNT = number of rounds
+_STK_OFFS_ = 0 ;locals are offsets from esp, which is forced to 16-byte alignment below
+ ;----- local variables ;<-- esp
+ StackVar X_stk , 8*(WCNT) ;local context vars
+ StackVar Wcopy , 8*(WCNT) ;copy of input block
+ StackVar ksTwk ,16*3 ;key schedule: tweak words (one 16-byte slot per word)
+ StackVar ksKey ,16*(WCNT)+16;key schedule: key words (16-byte slots, plus one for the parity word)
+FRAME_OFFS = ksTwk+128 ;<-- ebp
+ if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0 ;WCNT*64 = block size in bits; true when this size is looped
+ StackVar ksRot,16*(RND_CNT/4);leave space for ks "rotation" to happen
+ endif
+LOCAL_SIZE = _STK_OFFS_ ;size of local vars
+ ;
+ ;"restart" the stack defns, because we relocate esp to guarantee alignment
+ ; (i.e., these vars are NOT at fixed offsets from esp; they are addressed via ebx)
+_STK_OFFS_ = 0
+ ;-----
+ StackVar savRegs,8*4 ;pushad data
+ StackVar retAddr,4 ;return address
+ ;----- caller parameters
+ StackVar ctxPtr ,4 ;context ptr
+ StackVar blkPtr ,4 ;pointer to block data
+ StackVar blkCnt ,4 ;number of full blocks to process
+ StackVar bitAdd ,4 ;bit count to add to tweak
+ ;----- caller's stack frame
+;
+; Notes on stack frame setup:
+; * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
+; * the next most used is the key schedule words
+; so ebp is "centered" there, allowing short offsets to the key/tweak
+; schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
+; * the Wcopy variables are infrequently accessed, and they have long
+; offsets from both esp and ebp only in the 1024-bit case.
+; * all other local vars and calling parameters can be accessed
+; with short offsets, except in the 1024-bit case
+;
+ pushad ;save all regs
+ mov ebx,esp ;keep ebx as pointer to caller parms (esp value right after pushad)
+ sub esp,LOCAL_SIZE ;make room for the locals
+ and esp,not 15 ;force 16-byte alignment
+ mov edi,[ebx+ctxPtr ] ;edi --> Skein context
+ lea ebp,[esp+FRAME_OFFS] ;maximize use of short offsets from ebp
+ mov ecx,ptr32 [ebx+blkCnt] ;keep block cnt in ecx
+;
+endm ;Setup_Stack
+;
+FP_ equ <ebp-FRAME_OFFS> ;frame base: undoes the ebp bias so [FP_+var] addresses a local
+SI_ equ <esi-FRAME_OFFS> ;same bias for esi, the rolling key schedule pointer of the looping code
+ptr64 equ <qword ptr> ;useful abbreviations
+ptr32 equ <dword ptr>
+ptr08 equ <byte ptr>
+;
+;----------------------------------------------------------------
+;
+Reset_Stack macro procStart ;epilogue: procStart = label of the enclosing proc (used for the size report)
+ mov esp,ebx ;get rid of locals (they are not wiped); ebx = esp saved right after pushad
+ popad ;restore all regs
+
+ ;display code size in bytes to stdout (assembly-time message only)
+ irp _BCNT_,<%($+1-procStart)> ;account for return opcode
+if _BCNT_ ge 10000 ;(align it all pretty)
+%out procStart code size = _BCNT_ bytes
+elseif _BCNT_ ge 1000
+%out procStart code size = _BCNT_ bytes
+else
+%out procStart code size = _BCNT_ bytes
+endif
+ endm ;irp _BCNT_
+
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+ extrn _Skein_Show_Block:near ;calls to C routines
+ extrn _Skein_Show_Round:near
+;
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+;
+Skein_Debug_Block macro BLK_BITS ;dump block-level state through the C routine Skein_Show_Block
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+; const u08b_t *blkPtr, const u64b_t *wPtr,
+; const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+ Put_XMM_&BLK_BITS ;spill the XMM-resident state words to X_stk first
+ pushad ;save all regs
+ lea eax,[FP_+ksTwk+1] ;+1 = flag: "stride" size = 2 qwords
+ lea esi,[FP_+ksKey+1] ;same +1 flag on the key schedule pointer
+ lea ecx,[esp+32+Wcopy] ;adjust offset by 32 for pushad
+ mov edx,[ebx+ctxPtr] ;ctx_hdr_ptr
+ lea edx,[edx+X_VARS] ;edx ==> ctx->X[]
+ push eax ;tsPtr (arguments are pushed last to first)
+ push esi ;ksPtr
+ push ecx ;wPtr
+ push ptr32 [ebx+blkPtr] ;blkPtr
+ push edx ;ctx->Xptr
+ push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_BITS
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Block-4 ;strange linkage??
+ else
+ call _Skein_Show_Block
+ endif
+ add esp,7*4 ;discard parameter space on stack (7 dword arguments)
+ popad ;restore regs
+;
+ Get_XMM_&BLK_BITS ;reload the XMM state from X_stk
+endm ;Skein_Debug_Block
+
+;
+Skein_Debug_Round macro BLK_BITS,R,saveRegs ;R = round id; non-blank saveRegs = spill/reload XMM state around the call
+;
+;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X);
+;
+ ifnb <saveRegs>
+ Put_XMM_&BLK_BITS ;spill XMM state to X_stk so the dump sees current values
+ endif
+ pushad ;save all regs
+ if R ne SKEIN_RND_FEED_FWD
+ lea eax,[esp+32+X_stk] ;adjust offset by 32 for pushad
+ else
+ mov eax,[ebx+ctxPtr] ;after feedforward the values live in the context
+ add eax,X_VARS
+ endif
+ push eax ;Xptr
+ if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+ mov eax,R ;fully unrolled or special id: R is passed as is
+ else
+ lea eax,[4*edx+1+(((R)-1) and 3)] ;compute round number using edx
+ endif
+ push eax ;round number
+ push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr
+ mov eax,BLK_BITS
+ push eax ;bits
+ ifdef _MINGW_
+ call _Skein_Show_Round-4 ;strange linkage??
+ else
+ call _Skein_Show_Round
+ endif
+ add esp,4*4 ;discard parameter space on stack
+ popad ;restore regs
+
+ ifnb <saveRegs>
+ Get_XMM_&BLK_BITS ;reload the XMM state that was spilled for the debug dump
+ endif
+endm ;Skein_Debug_Round
+endif ;ifdef SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+; useful macros
+_ldX macro idx ;load X_stk word idx into register xmm(idx)
+ ifnb <idx> ;a blank argument expands to nothing
+ mov64 xmm&idx,ptr64 [esp+X_stk+8*idx]
+ endif
+endm ;_ldX
+
+_stX macro idx ;store register xmm(idx) into X_stk word idx
+ ifnb <idx> ;a blank argument expands to nothing
+ mov64 ptr64 [esp+X_stk+8*idx],xmm&idx
+ endif
+endm ;_stX
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 256
+ public _Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; Skein-256 round macros
+;
+R_256_OneRound macro _RR_,x0,x1,x2,x3,t0,t1 ;one round: mixes (x0,x1) and (x2,x3); t0,t1 = scratch XMM numbers
+ irp _qq_,<%((_RR_) and 7)> ;figure out which rotation constants to use
+ if x0 eq 0 ;x0 = 0: first pair takes constant _0, second pair takes _1
+_RC0_ = RC_256_&_qq_&_0
+_RC1_ = RC_256_&_qq_&_1
+ else ;otherwise the two constants are swapped between the pairs
+_RC0_ = RC_256_&_qq_&_1
+_RC1_ = RC_256_&_qq_&_0
+ endif
+ endm
+; first pair: x0 += x1, then x1 = rotl64(x1,_RC0_) xor x0
+ paddq xmm&x0,xmm&x1
+ mov64 xmm&t0,xmm&x1
+ psllq xmm&x1, _RC0_
+ psrlq xmm&t0,64-_RC0_
+ xorpd xmm&x1,xmm&x0
+ xorpd xmm&x1,xmm&t0
+; second pair: x2 += x3, then x3 = rotl64(x3,_RC1_) xor x2
+ paddq xmm&x2,xmm&x3
+ mov64 xmm&t1,xmm&x3
+ psllq xmm&x3, _RC1_
+ psrlq xmm&t1,64-_RC1_
+ xorpd xmm&x3,xmm&x2
+ xorpd xmm&x3,xmm&t1
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 256,%(_RR_+1),saveRegs
+ endif
+endm ;R_256_OneRound
+;
+R_256_FourRounds macro _RN_ ;four rounds starting at round _RN_, then one key injection
+ R_256_OneRound (_RN_+0),0,1,2,3,4,5
+ R_256_OneRound (_RN_+1),2,1,0,3,4,5
+
+ R_256_OneRound (_RN_+2),0,1,2,3,4,5
+ R_256_OneRound (_RN_+3),2,1,0,3,4,5
+
+ ;inject key schedule
+ inc edx ;bump round number (edx counts key injections, one per four rounds)
+ movd xmm4,edx ;xmm4 = counter value, added to X[3] below
+ if _UNROLL_CNT eq (ROUNDS_256/8)
+ ;fully unrolled version
+_RK_ = ((_RN_)/4) ;key injection counter
+ paddq xmm0,[FP_+ksKey+16*((_RK_+1) mod 5)]
+ paddq xmm1,[FP_+ksKey+16*((_RK_+2) mod 5)]
+ paddq xmm2,[FP_+ksKey+16*((_RK_+3) mod 5)]
+ paddq xmm3,[FP_+ksKey+16*((_RK_+4) mod 5)]
+ paddq xmm1,[FP_+ksTwk+16*((_RK_+1) mod 3)]
+ paddq xmm2,[FP_+ksTwk+16*((_RK_+2) mod 3)]
+ paddq xmm3,xmm4
+ else ;looping version
+ paddq xmm0,[SI_+ksKey+16*1]
+ paddq xmm1,[SI_+ksKey+16*2]
+ paddq xmm2,[SI_+ksKey+16*3]
+ paddq xmm3,[SI_+ksKey+16*4]
+ paddq xmm1,[SI_+ksTwk+16*1]
+ paddq xmm2,[SI_+ksTwk+16*2]
+ paddq xmm3,xmm4
+;
+ mov64 xmm4,<ptr64 [SI_+ksKey]>;now "rotate" key schedule on the stack: copy the oldest key word
+ mov64 xmm5,<ptr64 [SI_+ksTwk]>; and tweak word past the end (for next time through)
+ mov64 <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm4
+ mov64 <ptr64 [SI_+ksTwk+16*3]>,xmm5
+ add esi,16 ;bump rolling pointer
+ endif
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs
+ endif
+endm ;R_256_FourRounds
+;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_256 equ <call _Put_XMM_256>
+Get_XMM_256 equ <call _Get_XMM_256>
+
+_Put_XMM_256: ;store xmm0..xmm3 into X_stk[0..3]
+ irp _NN_,<0,1,2,3>
+ mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_ ;+4 skips the return address pushed by the call
+ endm
+ ret
+;
+_Get_XMM_256: ;reload xmm0..xmm3 from X_stk[0..3]
+ irp _NN_,<0,1,2,3>
+ mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8] ;+4 skips the return address pushed by the call
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+_Skein_256_Process_Block proc near ;process blkCnt blocks; state words X[0..3] live in xmm0..xmm3
+ WCNT = 4 ;WCNT=4 for Skein-256
+ Setup_Stack WCNT,ROUNDS_256 ;leaves ebx --> caller parms, edi --> context, ecx = block count
+ ; main hash loop for Skein_256
+Skein_256_block_loop:
+ movd xmm4,ptr32 [ebx+bitAdd] ;xmm4 = bitAdd parameter
+ mov64 xmm5,ptr64 [edi+TWEAK+0] ;xmm5 = T0
+ mov64 xmm6,ptr64 [edi+TWEAK+8] ;xmm6 = T1
+ paddq xmm5,xmm4 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [edi+TWEAK],xmm5 ;save updated tweak value T0 (for next time)
+ movapd xmm7,xmm6
+ xorpd xmm7,xmm5 ;compute overall tweak parity (T0 xor T1)
+ movdqa [FP_+ksTwk ],xmm5 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16],xmm6
+ movdqa [FP_+ksTwk+32],xmm7
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm4,eax
+ movd xmm0,edx
+ unpcklps xmm4,xmm0 ;pack two 32-bit words into xmm4
+;
+ irp _NN_,<0,1,2,3> ;copy in the chaining vars
+ mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+ xorpd xmm4,xmm&_NN_ ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm&_NN_
+ endm
+ movdqa [FP_+ksKey+16*WCNT],xmm4;save overall parity at the end of the array
+;
+ paddq xmm1,xmm5 ;inject the initial tweak words
+ paddq xmm2,xmm6
+;
+ irp _NN_,<0,1,2,3> ;perform the initial key injection
+ mov64 xmm4,ptr64 [esi+8*_NN_] ;and save a copy of the input block on stack
+ mov64 ptr64 [esp+8*_NN_+Wcopy],xmm4
+ paddq xmm&_NN_,xmm4
+ endm
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 256
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr ],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = iteration count (bumped once per four rounds)
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT = ROUNDS_256/8 ;fully unrolled
+else
+_UNROLL_CNT = SKEIN_UNROLL_256 ;partial unroll count
+ if ((ROUNDS_256/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_256" ;sanity check
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_256_round_loop: ; (since there's no 16* scaled address mode)
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2 ; here with X[0..3] in XMM0..XMM3
+ R_256_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+ if _UNROLL_CNT ne (ROUNDS_256/8)
+ cmp edx,2*(ROUNDS_256/8) ;loop until ROUNDS_256/4 key injections are done
+ jb Skein_256_round_loop
+ endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ irp _NN_,<0,1,2,3>
+ mov64 xmm4,ptr64 [esp+Wcopy+8*_NN_]
+ xorpd xmm&_NN_,xmm4
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+ endm
+ and ptr08 [edi +TWEAK +15],FIRST_MASK8 ;clear the FIRST block flag in the top tweak byte
+if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,saveRegs
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein_256_block_loop
+
+ Reset_Stack _Skein_256_Process_Block
+ ret
+;
+_Skein_256_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE ;set when SKEIN_CODE_SIZE or SKEIN_PERF is defined
+ public _Skein_256_Process_Block_CodeSize
+_Skein_256_Process_Block_CodeSize proc ;returns in eax the byte distance between the two labels below
+ mov eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block
+ ret
+_Skein_256_Process_Block_CodeSize endp
+;
+ public _Skein_256_Unroll_Cnt
+_Skein_256_Unroll_Cnt proc ;returns in eax the unroll count, or 0 when fully unrolled
+ if _UNROLL_CNT ne ROUNDS_256/8
+ mov eax,_UNROLL_CNT ;looping build: report the partial unroll count
+ else
+ xor eax,eax ;fully unrolled build: report 0
+ endif
+ ret
+_Skein_256_Unroll_Cnt endp
+endif
+endif ;_USE_ASM_ and 256
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 512
+ public _Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round
+;
+R_512_Round macro _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd ;four mixes (x0,x1,rotation index); xmm0..7 hold X[0..7]
+irp _nr_,<%((_RR_) and 7)> ;select the rotation constants for round (_RR_ and 7)
+_Ra_ = RC_512_&_nr_&_&Ra
+_Rb_ = RC_512_&_nr_&_&Rb
+_Rc_ = RC_512_&_nr_&_&Rc
+_Rd_ = RC_512_&_nr_&_&Rd
+endm
+ paddq xmm&a0,xmm&a1
+ _stX c0 ;spill c0 so its register can serve as scratch for the first mix
+ mov64 xmm&c0,xmm&a1
+ psllq xmm&a1, _Ra_
+ psrlq xmm&c0,64-_Ra_
+ xorpd xmm&a1,xmm&c0
+ xorpd xmm&a1,xmm&a0
+
+ paddq xmm&b0,xmm&b1
+ _stX a0 ;a0 is final for this round: spill it and reuse its register as scratch
+ mov64 xmm&a0,xmm&b1
+ psllq xmm&b1, _Rb_
+ psrlq xmm&a0,64-_Rb_
+ xorpd xmm&b1,xmm&b0
+ _ldX c0 ;restore c0 ahead of the third mix
+ xorpd xmm&b1,xmm&a0
+
+ paddq xmm&c0,xmm&c1
+ mov64 xmm&a0,xmm&c1
+ psllq xmm&c1, _Rc_
+ psrlq xmm&a0,64-_Rc_
+ xorpd xmm&c1,xmm&c0
+ xorpd xmm&c1,xmm&a0
+
+ paddq xmm&d0,xmm&d1
+ mov64 xmm&a0,xmm&d1
+ psllq xmm&d1, _Rd_
+ psrlq xmm&a0,64-_Rd_
+ xorpd xmm&d1,xmm&a0
+ _ldX a0 ;scratch use is over: restore a0
+ xorpd xmm&d1,xmm&d0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 512,%(_RR_+1),saveRegs
+ endif
+endm
+;
+; MACRO: four rounds
+R_512_FourRounds macro _RN_ ;four rounds starting at round _RN_, then one key injection
+ R_512_Round (_RN_) , 0,1,0, 2,3,1, 4,5,2, 6,7,3
+ R_512_Round (_RN_)+1, 2,1,0, 4,7,1, 6,5,2, 0,3,3
+ R_512_Round (_RN_)+2, 4,1,0, 6,3,1, 0,5,2, 2,7,3
+ R_512_Round (_RN_)+3, 6,1,0, 0,7,1, 2,5,2, 4,3,3
+
+ ;inject key schedule
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ if _UNROLL_CNT eq (ROUNDS_512/8)
+ paddq xmm&_NN_,[FP_+ksKey+16*((((_RN_)/4)+(_NN_)+1) mod 9)]
+ else
+ paddq xmm&_NN_,[SI_+ksKey+16*((_NN_)+1)]
+ endif
+ endm
+ _stX 0 ;free up a register
+ inc edx ;bump round counter (one count per four rounds)
+ movd xmm0,edx ;inject the tweak
+ if _UNROLL_CNT eq (ROUNDS_512/8) ;_RN_ is a multiple of 4 and 4 = 1 (mod 3), so _RN_ mod 3 equals (_RN_/4) mod 3 below
+ paddq xmm5,[FP_+ksTwk+16*(((_RN_)+1) mod 3)]
+ paddq xmm6,[FP_+ksTwk+16*(((_RN_)+2) mod 3)]
+ paddq xmm7,xmm0
+ else ;looping version
+ paddq xmm5,[SI_+ksTwk+16*1]
+ paddq xmm6,[SI_+ksTwk+16*2]
+ paddq xmm7,xmm0
+;
+ mov64 xmm0,<ptr64 [SI_+ksKey]>;now "rotate" key schedule on the stack
+ mov64 <ptr64 [SI_+ksKey+16*(WCNT+1)]>,xmm0
+ mov64 xmm0,<ptr64 [SI_+ksTwk]>; (for next time through)
+ mov64 <ptr64 [SI_+ksTwk+16*3]>,xmm0
+ add esi,16 ;bump rolling pointer
+ endif
+ _ldX 0 ;restore X0
+ if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,saveRegs
+ endif
+endm ;R_512_FourRounds
+;;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_512 equ <call _Put_XMM_512>
+Get_XMM_512 equ <call _Get_XMM_512>
+
+_Put_XMM_512: ;store xmm0..xmm7 into X_stk[0..7]
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_ ;+4 skips the return address pushed by the call
+ endm
+ ret
+;
+_Get_XMM_512: ;reload xmm0..xmm7 from X_stk[0..7]
+ irp _NN_,<0,1,2,3,4,5,6,7>
+ mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8] ;+4 skips the return address pushed by the call
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; code
+;
+_Skein_512_Process_Block proc near ;process blkCnt blocks; state words X[0..7] live in xmm0..xmm7
+ WCNT = 8 ;WCNT=8 for Skein-512
+ Setup_Stack WCNT,ROUNDS_512 ;leaves ebx --> caller parms, edi --> context, ecx = block count
+ ; main hash loop for Skein_512
+Skein_512_block_loop:
+ movd xmm0,ptr32 [ebx+bitAdd] ;xmm0 = bitAdd parameter
+ mov64 xmm1,ptr64 [edi+TWEAK+0] ;xmm1 = T0
+ mov64 xmm2,ptr64 [edi+TWEAK+8] ;xmm2 = T1
+ paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [edi+TWEAK],xmm1 ;save updated tweak value T0 (for next time)
+ mov64 xmm0,xmm2
+ xorpd xmm0,xmm1 ;compute overall tweak parity (T0 xor T1)
+ movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16*1],xmm2
+ movdqa [FP_+ksTwk+16*2],xmm0
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm0,eax
+ movd xmm7,edx
+ unpcklps xmm0,xmm7 ;pack two 32-bit words into xmm0
+;
+ irp _NN_,<7,6,5,4,3,2,1> ;copy in the chaining vars (skip #0 for now)
+ mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_]
+ xorpd xmm0,xmm&_NN_ ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm&_NN_
+ if _NN_ eq 5
+ paddq xmm5,xmm1 ;inject the initial tweak words
+ paddq xmm6,xmm2 ; (before they get trashed in xmm1/2)
+ endif
+ endm
+ mov64 xmm4,ptr64 [edi+X_VARS] ;handle #0 now (xmm0 still holds the parity, so use xmm4)
+ xorpd xmm0,xmm4 ;update overall parity
+ movdqa [FP_+ksKey+16* 0 ],xmm4;save the key value in slot #0
+ movdqa [FP_+ksKey+16*WCNT],xmm0;save overall parity at the end of the array
+;
+ mov64 xmm0,xmm4 ;move chaining var #0 into its home register
+ irp _NN_,<7,6,5, 4,3,2,1,0> ;perform the initial key injection (xmm4 is the scratch, so #4 is redone below)
+ mov64 xmm4,ptr64 [esi+ 8*_NN_];and save a copy of the input block on stack
+ mov64 ptr64 [esp+ 8*_NN_+Wcopy],xmm4
+ paddq xmm&_NN_,xmm4
+ endm
+ mov64 xmm4,ptr64 [esi+ 8*4] ;get input block word #4
+ mov64 ptr64 [esp+ 8*4+Wcopy],xmm4
+ paddq xmm4,[FP_+ksKey+16*4] ;inject the initial key
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 512
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = round counter (bumped once per four rounds)
+if SKEIN_ASM_UNROLL and 512
+_UNROLL_CNT = ROUNDS_512/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ if ((ROUNDS_512/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_512"
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_512_round_loop: ; (since there's no 16* scaled address mode)
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ R_512_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 512) eq 0
+ cmp edx,2*(ROUNDS_512/8) ;loop until ROUNDS_512/4 key injections are done
+ jb Skein_512_round_loop
+endif
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ and ptr08 [edi +TWEAK +15],FIRST_MASK8 ;clear the FIRST block flag in the top tweak byte
+irp _NN_,<0,2,4,6> ;do the aligned ones first
+ xorpd xmm&_NN_,[esp+Wcopy+8*_NN_]
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+irp _NN_,<1,3,5,7> ;now we have some register space available
+ mov64 xmm0,ptr64 [esp+Wcopy+8*_NN_]
+ xorpd xmm&_NN_,xmm0
+ mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_
+endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein_512_block_loop
+
+ Reset_Stack _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE ;set when SKEIN_CODE_SIZE or SKEIN_PERF is defined
+ public _Skein_512_Process_Block_CodeSize
+_Skein_512_Process_Block_CodeSize proc ;returns in eax the byte distance between the two labels below
+ mov eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block
+ ret
+_Skein_512_Process_Block_CodeSize endp
+;
+ public _Skein_512_Unroll_Cnt
+_Skein_512_Unroll_Cnt proc ;returns in eax the unroll count, or 0 when fully unrolled
+ if _UNROLL_CNT ne ROUNDS_512/8
+ mov eax,_UNROLL_CNT ;looping build: report the partial unroll count
+ else
+ xor eax,eax ;fully unrolled build: report 0
+ endif
+ ret
+_Skein_512_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 512
+;
+;----------------------------------------------------------------
+;
+if _USE_ASM_ and 1024
+ public _Skein1024_Process_Block
+;
+; void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+R_1024_REGS equ (5) ;keep this many block variables in registers
+;
+;;;;;;;;;;;;;;;;
+if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines
+Put_XMM_1024 equ <call _Put_XMM_1024>
+Get_XMM_1024 equ <call _Get_XMM_1024>
+
+_Put_XMM_1024: ;store the first R_1024_REGS XMM registers into X_stk
+_NN_ = 0
+ rept R_1024_REGS
+ irp _rr_,<%(_NN_)>
+ mov64 ptr64 [esp+X_stk+4+8*_NN_],xmm&_rr_ ;+4 skips the return address pushed by the call
+ endm
+_NN_ = _NN_+1
+ endm
+ ret
+;
+_Get_XMM_1024: ;reload the first R_1024_REGS XMM registers from X_stk
+_NN_ = 0
+ rept R_1024_REGS
+ irp _rr_,<%(_NN_)>
+ mov64 xmm&_rr_,ptr64 [esp+X_stk+4+8*_NN_] ;+4 skips the return address pushed by the call
+ endm
+_NN_ = _NN_+1
+ endm
+ ret
+endif
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one mix step
+MixStep_1024 macro x0,x1,rotIdx0,rotIdx1,_debug_ ;one mix of X[x0],X[x1]; rotIdx0 = round, rotIdx1 = constant index, _debug_ = optional dump flag
+_r0_ = x0 ;default, if already loaded
+_r1_ = x1
+ ; load the regs (if necessary): words at index R_1024_REGS and up live on the stack
+ if (x0 ge R_1024_REGS)
+_r0_ = 5
+ mov64 xmm5,ptr64 [esp+X_stk+8*(x0)]
+ endif
+ if (x1 ge R_1024_REGS)
+_r1_ = 6
+ mov64 xmm6,ptr64 [esp+X_stk+8*(x1)]
+ endif
+ ; do the mix: X[x0] += X[x1], then X[x1] = rotl64(X[x1],_Rc_) xor X[x0]; xmm7 is scratch
+ irp _rx_,<%((rotIdx0) and 7)>
+_Rc_ = RC_1024_&_rx_&_&rotIdx1 ;rotation constant
+ endm
+ irp _x0_,<%_r0_>
+ irp _x1_,<%_r1_>
+ paddq xmm&_x0_,xmm&_x1_
+ mov64 xmm7 ,xmm&_x1_
+ psllq xmm&_x1_, _Rc_
+ psrlq xmm7 ,64-_Rc_
+ xorpd xmm&_x1_,xmm&_x0_
+ xorpd xmm&_x1_,xmm7
+ endm
+ endm
+ ; save the regs (if necessary)
+ if (x0 ge R_1024_REGS)
+ mov64 ptr64 [esp+X_stk+8*(x0)],xmm5
+ endif
+ if (x1 ge R_1024_REGS)
+ mov64 ptr64 [esp+X_stk+8*(x1)],xmm6
+ endif
+ ; debug output (parameter spelled exactly as declared, so it also assembles case-sensitively)
+ if _SKEIN_DEBUG and (0 ne (_debug_ + 0))
+ Skein_Debug_Round 1024,%((rotIdx0)+1),saveRegs
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds
+;
+R_1024_FourRounds macro _RR_ ;four rounds starting at round _RR_, then one key injection
+ ;--------- round _RR_ (a trailing 1 argument requests the debug dump after the last mix of a round)
+ MixStep_1024 0, 1,%((_RR_)+0),0
+ MixStep_1024 2, 3,%((_RR_)+0),1
+ MixStep_1024 4, 5,%((_RR_)+0),2
+ MixStep_1024 6, 7,%((_RR_)+0),3
+ MixStep_1024 8, 9,%((_RR_)+0),4
+ MixStep_1024 10,11,%((_RR_)+0),5
+ MixStep_1024 12,13,%((_RR_)+0),6
+ MixStep_1024 14,15,%((_RR_)+0),7,1
+ ;--------- round _RR_+1
+ MixStep_1024 0, 9,%((_RR_)+1),0
+ MixStep_1024 2,13,%((_RR_)+1),1
+ MixStep_1024 6,11,%((_RR_)+1),2
+ MixStep_1024 4,15,%((_RR_)+1),3
+ MixStep_1024 10, 7,%((_RR_)+1),4
+ MixStep_1024 12, 3,%((_RR_)+1),5
+ MixStep_1024 14, 5,%((_RR_)+1),6
+ MixStep_1024 8, 1,%((_RR_)+1),7,1
+ ;--------- round _RR_+2
+ MixStep_1024 0, 7,%((_RR_)+2),0
+ MixStep_1024 2, 5,%((_RR_)+2),1
+ MixStep_1024 4, 3,%((_RR_)+2),2
+ MixStep_1024 6, 1,%((_RR_)+2),3
+ MixStep_1024 12,15,%((_RR_)+2),4
+ MixStep_1024 14,13,%((_RR_)+2),5
+ MixStep_1024 8,11,%((_RR_)+2),6
+ MixStep_1024 10, 9,%((_RR_)+2),7,1
+ ;--------- round _RR_+3
+ MixStep_1024 0,15,%((_RR_)+3),0
+ MixStep_1024 2,11,%((_RR_)+3),1
+ MixStep_1024 6,13,%((_RR_)+3),2
+ MixStep_1024 4, 9,%((_RR_)+3),3
+ MixStep_1024 14, 1,%((_RR_)+3),4
+ MixStep_1024 8, 5,%((_RR_)+3),5
+ MixStep_1024 10, 3,%((_RR_)+3),6
+ MixStep_1024 12, 7,%((_RR_)+3),7,1
+
+ inc edx ;edx = round number (one count per four rounds)
+ movd xmm7,edx ;xmm7 = counter value, added to X[15] below
+ ;inject the key (register-resident words directly, the others through xmm6)
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+ if _UNROLL_CNT ne (ROUNDS_1024/8)
+ if _NN_ lt R_1024_REGS
+ paddq xmm&_NN_,ptr64 [SI_+ksKey+16*_NN_+16]
+ else
+ mov64 xmm6 ,ptr64 [esp+X_stk+ 8*_NN_]
+ if _NN_ eq 15
+ paddq xmm6,xmm7
+ elseif _NN_ eq 14
+ paddq xmm6,ptr64 [SI_+ksTwk+16*2]
+ elseif _NN_ eq 13
+ paddq xmm6,ptr64 [SI_+ksTwk+16*1]
+ endif
+ paddq xmm6 ,ptr64 [SI_+ksKey+16*_NN_+16]
+ mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6
+ endif
+ else
+ if _NN_ lt R_1024_REGS
+ paddq xmm&_NN_,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+ else
+ mov64 xmm6,ptr64 [esp+X_stk+ 8*_NN_]
+ paddq xmm6,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)]
+ if _NN_ eq 15
+ paddq xmm6,xmm7
+ elseif _NN_ eq 14
+ paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+2) mod 3)]
+ elseif _NN_ eq 13
+ paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+1) mod 3)]
+ endif
+ mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6
+ endif
+ endif
+endm
+if _UNROLL_CNT ne (ROUNDS_1024/8) ;rotate the key schedule on the stack
+ mov64 xmm6,ptr64 [SI_+ksKey]
+ mov64 xmm7,ptr64 [SI_+ksTwk]
+ mov64 ptr64 [SI_+ksKey+16*(WCNT+1)],xmm6
+ mov64 ptr64 [SI_+ksTwk+16* 3 ],xmm7
+ add esi,16 ;bump rolling pointer
+endif
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs
+endif
+endm ;R_1024_FourRounds
+;;;;;;;;;;;;;;;;
+; code
+;
+_Skein1024_Process_Block proc near ;process blkCnt blocks; X[0..4] live in xmm0..xmm4, X[5..15] in X_stk
+;
+ WCNT = 16 ;WCNT=16 for Skein-1024
+ Setup_Stack WCNT,ROUNDS_1024 ;leaves ebx --> caller parms, edi --> context, ecx = block count
+ add edi,80h ;bias the edi ctxt offsets to keep them all short
+ctx equ <edi-80h> ;offset alias
+ ; main hash loop for Skein1024
+Skein1024_block_loop:
+ movd xmm0,ptr32 [ebx+bitAdd] ;xmm0 = bitAdd parameter
+ mov64 xmm1,ptr64 [ctx+TWEAK+0] ;xmm1 = T0
+ mov64 xmm2,ptr64 [ctx+TWEAK+8] ;xmm2 = T1
+ paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter
+ mov64 ptr64 [ctx+TWEAK],xmm1 ;save updated tweak value T0 (for next time)
+ mov64 xmm0,xmm2
+ xorpd xmm0,xmm1 ;compute overall tweak parity (T0 xor T1)
+ movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack
+ movdqa [FP_+ksTwk+16],xmm2
+ movdqa [FP_+ksTwk+32],xmm0
+
+ mov esi,[ebx+blkPtr] ;esi --> input block
+ mov eax,KW_PARITY_LO ;init key schedule parity accumulator
+ mov edx,KW_PARITY_HI
+ movd xmm7,eax
+ movd xmm6,edx
+ unpcklps xmm7,xmm6 ;pack two 32-bit words into xmm7
+;
+ lea eax,[esp+80h] ;use short offsets for Wcopy, X_stk writes below
+SP_ equ <eax-80h> ;[eax+OFFS] mode is one byte shorter than [esp+OFFS]
+irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>
+ mov64 xmm6,ptr64 [ctx+X_VARS+8*_NN_]
+ xorpd xmm7,xmm6 ;update overall parity
+ movdqa [FP_+ksKey+16*_NN_],xmm6;save the key schedule on the stack
+ if _NN_ lt R_1024_REGS
+ _rr_ = _NN_
+ else
+ _rr_ = R_1024_REGS
+ endif
+ irp _rn_,<%(_rr_)>
+ mov64 xmm&_rn_,ptr64 [esi+ 8*_NN_];save copy of the input block on stack
+ mov64 ptr64 [SP_+ Wcopy + 8*_NN_],xmm&_rn_ ;(for feedforward later)
+ paddq xmm&_rn_,xmm6 ;inject the key into the block
+ if _NN_ eq 13
+ paddq xmm&_rn_,xmm1 ;inject the initial tweak words
+ elseif _NN_ eq 14
+ paddq xmm&_rn_,xmm2
+ endif
+ if _NN_ ge R_1024_REGS ;only save X[5..15] on stack, leave X[0..4] in regs
+ mov64 ptr64 [SP_+X_stk+8*_NN_],xmm&_rn_
+ endif
+ endm
+endm
+ movdqa [FP_+ksKey+16*WCNT],xmm7;save overall key parity at the end of the array
+;
+if _SKEIN_DEBUG ;debug dump of state at this point
+ Skein_Debug_Block 1024
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,saveRegs
+endif
+ add esi, WCNT*8 ;skip to the next block
+ mov [ebx+blkPtr],esi ;save the updated block pointer
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+ xor edx,edx ;edx = round counter (bumped once per four rounds)
+if SKEIN_ASM_UNROLL and 1024
+_UNROLL_CNT = ROUNDS_1024/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ if ((ROUNDS_1024/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_1024"
+ endif
+ mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey
+Skein_1024_round_loop:
+endif
+;
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ R_1024_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+endm ;rept _UNROLL_CNT*2
+;
+if (SKEIN_ASM_UNROLL and 1024) eq 0
+ cmp edx,2*(ROUNDS_1024/8) ;loop until ROUNDS_1024/4 key injections are done
+ jb Skein_1024_round_loop
+endif
+ and ptr08 [ctx +TWEAK +15],FIRST_MASK8 ;clear tweak bit for next time thru
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ lea eax,[esp+80h] ;allow short offsets to X_stk and Wcopy
+irp _NN_,<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+ if _NN_ lt R_1024_REGS
+ if _NN_ and 1 ;already in regs: no load needed
+ mov64 xmm7 ,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned
+ xorpd xmm&_NN_,xmm7
+ else
+ xorpd xmm&_NN_, [SP_+ Wcopy + 8*_NN_] ;aligned
+ endif
+ mov64 ptr64 [ctx+ X_VARS+ 8*_NN_],xmm&_NN_
+ else
+ mov64 xmm7,ptr64 [SP_+ X_stk + 8*_NN_] ;load X value from stack
+ if _NN_ and 1
+ mov64 xmm6,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned
+ xorpd xmm7,xmm6
+ else
+ xorpd xmm7, [SP_+ Wcopy + 8*_NN_] ;aligned
+ endif
+ mov64 ptr64 [ctx+ X_VARS+ 8*_NN_],xmm7
+ endif
+endm
+if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD ;no need to save regs on stack here
+endif
+ ; go back for more blocks, if needed
+ dec ecx
+ jnz Skein1024_block_loop
+
+ Reset_Stack _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block endp
+;
+ifdef _SKEIN_CODE_SIZE ;set when SKEIN_CODE_SIZE or SKEIN_PERF is defined
+ public _Skein1024_Process_Block_CodeSize
+_Skein1024_Process_Block_CodeSize proc ;returns in eax the byte distance between the two labels below
+ mov eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block
+ ret
+_Skein1024_Process_Block_CodeSize endp
+;
+ public _Skein1024_Unroll_Cnt
+_Skein1024_Unroll_Cnt proc ;returns in eax the unroll count, or 0 when fully unrolled
+ if _UNROLL_CNT ne ROUNDS_1024/8
+ mov eax,_UNROLL_CNT ;looping build: report the partial unroll count
+ else
+ xor eax,eax ;fully unrolled build: report 0
+ endif
+ ret
+_Skein1024_Unroll_Cnt endp
+endif
+;
+endif ; _USE_ASM_ and 1024
+;----------------------------------------------------------------
+ end
diff --git a/Additional_Implementations/skein_block_xmm32.s b/Additional_Implementations/skein_block_xmm32.s
new file mode 100644
index 000000000000..fa10bd2b98c1
--- /dev/null
+++ b/Additional_Implementations/skein_block_xmm32.s
@@ -0,0 +1,1110 @@
+#
+#----------------------------------------------------------------
+# 32-bit x86 assembler code for Skein block functions using XMM registers
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+#
+ .text
+ .altmacro #use advanced macro features
+ .psize 0,128 #list file has no page boundaries
+#
+_MASK_ALL_ = (256+512+1024) #all three algorithm bits
+SAVE_REGS = 1 #value passed as the saveRegs argument of Skein_Debug_Round
+# _USE_ASM_: bit mask (256/512/1024) selecting which block functions this file assembles
+#################
+.ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_ #no selection given: assemble all three
+.elseif SKEIN_USE_ASM & _MASK_ALL_
+_USE_ASM_ = SKEIN_USE_ASM #at least one valid bit is set: use the given selection
+.else
+_USE_ASM_ = _MASK_ALL_ #no valid bit set: fall back to all three
+.endif
+# _SKEIN_LOOP: decimal digits give the unroll counts for Skein-256/512/1024 (hundreds/tens/ones)
+#################
+.ifndef SKEIN_LOOP
+_SKEIN_LOOP = 002 #default is all fully unrolled, except Skein1024
+.else
+_SKEIN_LOOP = SKEIN_LOOP #NOTE(review): a value written with a leading zero would be read as octal by gas -- confirm how callers define it
+.endif
+#--------------
+# the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
+# SKEIN_ASM_UNROLL: sum of the block sizes (256/512/1024) whose unroll count is 0
+SKEIN_ASM_UNROLL = 0
+ .irp _NN_,256,512,1024
+ .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_
+ .endif
+ .endr
+#
+#################
+#
+.ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72 #default round counts
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+.else #each decimal digit d selects 8*(((d+5)%10)+5) rounds: d=5..9 --> 40..72, d=0..4 --> 80..112
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
+.irp _NN_,256,512,1024 #report the selected round count for each enabled block size
+ .if _USE_ASM_ && \_NN_
+ .irp _RR_,%(ROUNDS_\_NN_) #%(...) turns the symbol value into text for the message
+ .if \_NN_ < 1024
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .else #(both branches print the same text)
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .endif
+ .endr
+ .endif
+.endr
+.endif
+#################
+#
+.ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE = (1) #when defined, the *_CodeSize and *_Unroll_Cnt helper routines are assembled
+.else
+.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.endif
+.endif
+#
+#################
+#
+.ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0 #debug macros and calls are not assembled
+.else
+_SKEIN_DEBUG = 1 #assemble the calls to _Skein_Show_Block/_Skein_Show_Round
+.endif
+#################
+#
+# define offsets of fields in hash context structure
+#
+HASH_BITS = 0 #number of bits of hash output (offset 0)
+BCNT = 4 + HASH_BITS #number of bytes in BUFFER[] (offset 4)
+TWEAK = 4 + BCNT #tweak values[0..1] (offset 8, 16 bytes)
+X_VARS = 16 + TWEAK #chaining vars (offset 24)
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY_LO= 0xA9FC1A22 #overall parity of key schedule words (hi32/lo32)
+KW_PARITY_HI= 0x1BD11BDA
+FIRST_MASK8 = ~ (1 << 6) #FIRST block flag bit: AND-ed into the byte at TWEAK+15 to clear bit 6
+#
+# rotation constants for Skein
+#
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack: StackVar name,size defines <name> as the running offset, then advances it by <size>
+.macro StackVar localName,localSize
+\localName = _STK_OFFS_ #offset of this variable
+_STK_OFFS_ = _STK_OFFS_+(\localSize) #next variable starts after it
+.endm #StackVar
+#
+#----------------------------------------------------------------
+# Setup_Stack WCNT,RND_CNT -- WCNT = number of 64-bit state words, RND_CNT = round count
+# MACRO: Configure stack frame, allocate local vars
+# On exit: ebx --> saved regs + caller parms, esp --> 16-byte aligned locals, edi = ctxPtr, ebp = esp+FRAME_OFFS, ecx = blkCnt
+.macro Setup_Stack WCNT,RND_CNT
+_STK_OFFS_ = 0 #starting offset from esp, forced on 16-byte alignment
+ #----- local variables #<-- esp
+ StackVar X_stk , 8*(WCNT) #local context vars
+ StackVar Wcopy , 8*(WCNT) #copy of input block
+ StackVar ksTwk ,16*3 #key schedule: tweak words
+ StackVar ksKey ,16*(WCNT)+16#key schedule: key words
+FRAME_OFFS = ksTwk+128 #<-- ebp
+F_O = FRAME_OFFS #syntactic shorthand
+ .if (SKEIN_ASM_UNROLL && (WCNT*64)) == 0 #extra room is only reserved for the looping (not fully unrolled) build
+ StackVar ksRot,16*(RND_CNT/4)#leave space for ks "rotation" to happen
+ .endif
+LOCAL_SIZE = _STK_OFFS_ #size of local vars
+ #
+ #"restart" the stack defns, because we relocate esp to guarantee alignment
+ # (i.e., these vars are NOT at fixed offsets from esp)
+_STK_OFFS_ = 0 #the offsets below are relative to ebx (esp as it was right after pushal)
+ #-----
+ StackVar savRegs,8*4 #pushad data
+ StackVar retAddr,4 #return address
+ #----- caller parameters
+ StackVar ctxPtr ,4 #context ptr
+ StackVar blkPtr ,4 #pointer to block data
+ StackVar blkCnt ,4 #number of full blocks to process
+ StackVar bitAdd ,4 #bit count to add to tweak
+ #----- caller's stack frame
+#
+# Notes on stack frame setup:
+# * the most used variable (except for Skein-256) is X_stk[], based at [esp+0]
+# * the next most used is the key schedule words
+# so ebp is "centered" there, allowing short offsets to the key/tweak
+# schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-(
+# * the Wcopy variables are infrequently accessed, and they have long
+# offsets from both esp and ebp only in the 1024-bit case.
+# * all other local vars and calling parameters can be accessed
+# with short offsets, except in the 1024-bit case
+#
+ pushal #save all regs
+ movl %esp,%ebx #keep ebx as pointer to caller parms
+ subl $LOCAL_SIZE,%esp #make room for the locals
+ andl $~15,%esp #force alignment
+ movl ctxPtr(%ebx),%edi #edi --> Skein context
+ leal FRAME_OFFS(%esp),%ebp #maximize use of short offsets from ebp
+ movl blkCnt(%ebx),%ecx #keep block cnt in ecx
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+.macro Reset_Stack,procStart #undo Setup_Stack; the procStart argument is not referenced in the body
+ movl %ebx,%esp #get rid of locals (wipe??)
+ popal #restore all regs
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+ .extern _Skein_Show_Block #calls to C routines
+ .extern _Skein_Show_Round
+#
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+#
+.macro Skein_Debug_Block BLK_BITS
+# Dump the block-level state through the C routine declared below; BLK_BITS selects the _Put/_Get_XMM_<bits> helpers
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+# const u08b_t *blkPtr, const u64b_t *wPtr,
+# const u64b_t *ksPtr,const u64b_t *tsPtr)#
+# The XMM state is spilled to X_stk first and reloaded afterwards; pushal/popal preserve the integer regs
+ call _Put_XMM_\BLK_BITS
+ pushal #save all regs
+ leal ksTwk+1-F_O(%ebp),%eax #+1 = flag: "stride" size = 2 qwords
+ leal ksKey+1-F_O(%ebp),%esi
+ leal Wcopy+32(%esp),%ecx #adjust offset by 32 for pushad
+ movl ctxPtr(%ebx) ,%edx #ctx_hdr_ptr
+ leal X_VARS(%edx) ,%edx #edx ==> ctx->X[]
+ pushl %eax #tsPtr
+ pushl %esi #ksPtr
+ pushl %ecx #wPtr
+ pushl blkPtr(%ebx) #blkPtr
+ pushl %edx #ctx->Xptr
+ pushl ctxPtr(%ebx) #ctx_hdr_ptr
+ movl $\BLK_BITS,%eax
+ pushl %eax #bits
+ call _Skein_Show_Block
+ addl $7*4,%esp #discard parameter space on stack
+ popal #restore regs
+#
+ call _Get_XMM_\BLK_BITS
+.endm #Skein_Debug_Block
+
+#
+.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0
+# Report round R through the C routine below; X comes from X_stk, or from ctx->X[] when R is SKEIN_RND_FEED_FWD
+#void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)#
+# saveRegs<>0: spill the XMM state to X_stk before the call and reload it afterwards
+ .if \saveRegs
+ call _Put_XMM_\BLK_BITS
+ .endif
+ pushal #save all regs
+ .if R <> SKEIN_RND_FEED_FWD
+ leal 32+X_stk(%esp),%eax #adjust offset by 32 for pushal
+ .else
+ movl ctxPtr(%ebx),%eax #feedforward: the result already lives in ctx->X[]
+ addl $X_VARS,%eax
+ .endif
+ pushl %eax #Xptr
+ .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+ movl $\R,%eax #R is used as given (fully unrolled build, or one of the special codes)
+ .else #compute round number from edx, R
+ leal 1+(((\R)-1) && 3)(,%edx,4),%eax
+ .endif
+ pushl %eax #round number
+ pushl ctxPtr(%ebx) #ctx_hdr_ptr
+ movl $\BLK_BITS,%eax
+ pushl %eax #bits
+ call _Skein_Show_Round
+ addl $4*4,%esp #discard parameter space on stack
+ popal #restore regs
+ .if \saveRegs
+ call _Get_XMM_\BLK_BITS #reload the XMM regs that were spilled above
+ .endif
+.endm #Skein_Debug_Round
+.endif #ifdef SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+# useful macros
+.macro _ldX xn #load X_stk[xn] into xmm<xn>
+ movq X_stk+8*(\xn)(%esp),%xmm\xn
+.endm
+
+.macro _stX xn #store xmm<xn> (low 64 bits) into X_stk[xn]
+ movq %xmm\xn,X_stk+8*(\xn)(%esp)
+.endm
+#
+#----------------------------------------------------------------
+#
+.macro C_label lName #define and export a code label both as <lName> and as _<lName>
+ \lName: #use both "genders" to work across linkage conventions
+_\lName:
+ .global \lName
+ .global _\lName
+.endm
+#
+
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+#
+# Skein-256 round macros
+# R_256_OneRound: mix the pairs (x0,x1) and (x2,x3) for round _RR_; all of x0..x3,t0,t1 are XMM register numbers (t0,t1 = scratch)
+.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1
+ .irp _qq_,%((\_RR_) && 7) #figure out which rotation constants to use
+ .if \x0 == 0
+_RC0_ = RC_256_\_qq_&&_0
+_RC1_ = RC_256_\_qq_&&_1
+ .else #x0 is not 0 (callers pass 2,1,0,3): the two constants are swapped to match
+_RC0_ = RC_256_\_qq_&&_1
+_RC1_ = RC_256_\_qq_&&_0
+ .endif
+ .endr
+# first pair: x0 += x1, then x1 = (x1 << _RC0_) ^ (x1 >> (64-_RC0_)) ^ x0
+ paddq %xmm\x1,%xmm\x0
+ movq %xmm\x1,%xmm\t0
+ psllq $ _RC0_,%xmm\x1
+ psrlq $64-_RC0_,%xmm\t0
+ xorpd %xmm\x0,%xmm\x1
+ xorpd %xmm\t0,%xmm\x1
+# second pair: x2 += x3, then x3 = (x3 << _RC1_) ^ (x3 >> (64-_RC1_)) ^ x2
+ paddq %xmm\x3,%xmm\x2
+ movq %xmm\x3,%xmm\t1
+ psllq $ _RC1_,%xmm\x3
+ psrlq $64-_RC1_,%xmm\t1
+ xorpd %xmm\x2,%xmm\x3
+ xorpd %xmm\t1,%xmm\x3
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS
+ .endif
+.endm #R_256_OneRound
+#
+.macro R_256_FourRounds _RN_ #four rounds starting at round _RN_, followed by one key injection
+ R_256_OneRound %(_RN_+0),0,1,2,3,4,5
+ R_256_OneRound (_RN_+1),2,1,0,3,4,5
+
+ R_256_OneRound (_RN_+2),0,1,2,3,4,5
+ R_256_OneRound (_RN_+3),2,1,0,3,4,5
+
+ #inject key schedule
+ incl %edx #bump round number
+ movd %edx,%xmm4 #xmm4 = counter value, added to X[3] below
+ .if _UNROLL_CNT == (ROUNDS_256/8)
+ #fully unrolled version
+_RK_ = ((_RN_)/4) #key injection counter
+ paddq ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0
+ paddq ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1
+ paddq ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2
+ paddq ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3
+ paddq ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1
+ paddq ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2
+ paddq %xmm4,%xmm3
+ .else #looping version
+ paddq ksKey+16*1-F_O(%esi),%xmm0 #esi = rolling pointer into the schedule (advanced by 16 below)
+ paddq ksKey+16*2-F_O(%esi),%xmm1
+ paddq ksKey+16*3-F_O(%esi),%xmm2
+ paddq ksKey+16*4-F_O(%esi),%xmm3
+ paddq ksTwk+16*1-F_O(%esi),%xmm1
+ paddq ksTwk+16*2-F_O(%esi),%xmm2
+ paddq %xmm4,%xmm3
+# copy the oldest key/tweak word to the slot just past the current window
+ movq ksKey-F_O(%esi),%xmm4 #first, "rotate" key schedule on the stack
+ movq ksTwk-F_O(%esi),%xmm5 # (for next time through)
+ movq %xmm4,ksKey+16*(WCNT+1)-F_O(%esi)
+ movq %xmm5,ksTwk+16*3-F_O(%esi)
+ addl $16,%esi #bump rolling pointer
+ .endif
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS
+ .endif
+.endm #R_256_FourRounds
+#
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+_Put_XMM_256: #store xmm0..xmm3 to X_stk[0..3]; the +4 skips the return address pushed by the call
+ .irp _NN_,0,1,2,3
+ movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
+ .endr
+ ret
+#
+_Get_XMM_256: #reload xmm0..xmm3 from X_stk[0..3] (same +4 adjustment)
+ .irp _NN_,0,1,2,3
+ movq X_stk+4+\_NN_*8(%esp),%xmm\_NN_ #\_NN_ written explicitly, matching _Put_XMM_256 and _Get_XMM_512
+ .endr
+ ret
+.endif
+#
+#################
+#
+# code
+#
+C_label Skein_256_Process_Block
+ WCNT = 4 #WCNT=4 for Skein-256
+ Setup_Stack WCNT,ROUNDS_256
+ # main hash loop for Skein_256 (one iteration per input block; ecx = blocks remaining)
+Skein_256_block_loop:
+ movd bitAdd (%ebx),%xmm4
+ movq TWEAK+0(%edi),%xmm5
+ movq TWEAK+8(%edi),%xmm6
+ paddq %xmm4 ,%xmm5 #bump T0 by the bitAdd parameter
+ movq %xmm5,TWEAK(%edi) #save updated tweak value T0 (for next time)
+ movapd %xmm6,%xmm7
+ xorpd %xmm5,%xmm7 #compute overall tweak parity
+ movdqa %xmm5,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack
+ movdqa %xmm6,ksTwk+16-F_O(%ebp)
+ movdqa %xmm7,ksTwk+32-F_O(%ebp)
+
+ movl blkPtr(%ebx),%esi #esi --> input block
+ movl $KW_PARITY_LO,%eax #init key schedule parity accumulator
+ movl $KW_PARITY_HI,%edx
+ movd %eax ,%xmm4
+ movd %edx ,%xmm0
+ unpcklps %xmm0,%xmm4 #combine: low qword of xmm4 = KW_PARITY_HI:KW_PARITY_LO
+#
+ .irp _NN_,0,1,2,3 #copy in the chaining vars
+ movq X_VARS+8*\_NN_(%edi),%xmm\_NN_
+ xorpd %xmm\_NN_,%xmm4 #update overall parity
+ movdqa %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp) #key word N goes to schedule slot N (16-byte slots)
+ .endr
+ movdqa %xmm4,ksKey+16*WCNT-F_O(%ebp)#save overall parity at the end of the array
+#
+ paddq %xmm5,%xmm1 #inject the initial tweak words
+ paddq %xmm6,%xmm2
+#
+ .irp _NN_,0,1,2,3 #perform the initial key injection
+ movq 8*\_NN_(%esi),%xmm4#and save a copy of the input block on stack
+ movq %xmm4,8*\_NN_+Wcopy(%esp)
+ paddq %xmm4,%xmm\_NN_ #inject the key word
+ .endr
+#
+.if _SKEIN_DEBUG #debug dump of state at this point
+ Skein_Debug_Block 256
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+ addl $WCNT*8,%esi #skip to the next block
+ movl %esi,blkPtr(%ebx) #save the updated block pointer
+ #
+ # now the key schedule is computed. Start the rounds
+ #
+ xorl %edx,%edx #edx = iteration count
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT = ROUNDS_256/8 #fully unrolled
+.else
+_UNROLL_CNT = SKEIN_UNROLL_256 #partial unroll count
+ .if ((ROUNDS_256/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_256" #sanity check
+ .endif
+ movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey
+Skein_256_round_loop: # (since there's no 16* scaled address mode)
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2 # here with X[0..3] in XMM0..XMM3
+ R_256_FourRounds _Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+ .if _UNROLL_CNT <> (ROUNDS_256/8)
+ cmpl $2*(ROUNDS_256/8),%edx #edx is bumped once per four rounds
+ jb Skein_256_round_loop
+ .endif
+ #----------------------------
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ .irp _NN_,0,1,2,3
+ movq Wcopy+8*\_NN_(%esp),%xmm4
+ xorpd %xmm4,%xmm\_NN_
+ movq %xmm\_NN_,X_VARS+8*\_NN_(%edi)
+ .endr
+ andb $FIRST_MASK8,TWEAK +15(%edi) #clear the FIRST flag bit for the next block
+.if _SKEIN_DEBUG
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS
+.endif
+ # go back for more blocks, if needed
+ decl %ecx
+ jnz Skein_256_block_loop
+ Reset_Stack _Skein_256_Process_Block
+ ret
+#
+.ifdef _SKEIN_CODE_SIZE
+C_label Skein_256_Process_Block_CodeSize #returns eax = byte distance from _Skein_256_Process_Block to this label
+ movl $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax
+ ret
+#
+C_label Skein_256_Unroll_Cnt #returns eax = _UNROLL_CNT when it differs from ROUNDS_256/8, otherwise 0
+ .if _UNROLL_CNT <> ROUNDS_256/8
+ movl $_UNROLL_CNT,%eax
+ .else
+ xorl %eax,%eax
+ .endif
+ ret
+.endif
+.endif #_USE_ASM_ & 256
+#
+#----------------------------------------------------------------
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+# MACRO: one round -- four mixes (a0,a1) (b0,b1) (c0,c1) (d0,d1); Ra..Rd = rotation-constant column for each pair
+# All eight XMM regs hold state, so xmm<c0> and then xmm<a0> are spilled to X_stk and borrowed as scratch
+.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd
+ .irp _qq_,%((\_RR_) && 7)
+_Ra_ = RC_512_\_qq_&&_\Ra
+_Rb_ = RC_512_\_qq_&&_\Rb
+_Rc_ = RC_512_\_qq_&&_\Rc
+_Rd_ = RC_512_\_qq_&&_\Rd
+ .endr
+ paddq %xmm\a1 , %xmm\a0
+ _stX c0 #spill c0 so its register can serve as scratch
+ movq %xmm\a1 , %xmm\c0
+ psllq $ _Ra_ , %xmm\a1
+ psrlq $64-_Ra_ , %xmm\c0
+ xorpd %xmm\c0 , %xmm\a1
+ xorpd %xmm\a0 , %xmm\a1
+
+ paddq %xmm\b1 , %xmm\b0
+ _stX a0 #spill the updated a0; its register is the scratch from here on
+ movq %xmm\b1 , %xmm\a0
+ psllq $ _Rb_ , %xmm\b1
+ psrlq $64-_Rb_ , %xmm\a0
+ xorpd %xmm\b0 , %xmm\b1
+ _ldX c0 #bring c0 back before the c-pair mix
+ xorpd %xmm\a0 , %xmm\b1
+
+ paddq %xmm\c1 , %xmm\c0
+ movq %xmm\c1 , %xmm\a0
+ psllq $ _Rc_ , %xmm\c1
+ psrlq $64-_Rc_ , %xmm\a0
+ xorpd %xmm\c0 , %xmm\c1
+ xorpd %xmm\a0 , %xmm\c1
+
+ paddq %xmm\d1 , %xmm\d0
+ movq %xmm\d1 , %xmm\a0
+ psllq $ _Rd_ , %xmm\d1
+ psrlq $64-_Rd_ , %xmm\a0
+ xorpd %xmm\a0 , %xmm\d1
+ _ldX a0 #restore a0 (scratch use is finished)
+ xorpd %xmm\d0 , %xmm\d1
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS
+ .endif
+.endm
+#
+# MACRO: four rounds starting at round _RN_, followed by one key injection
+.macro R_512_FourRounds _RN_
+ R_512_Round %((_RN_) ), 0,1,0, 2,3,1, 4,5,2, 6,7,3
+ R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3
+ R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3
+ R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3
+
+ #inject key schedule
+.irp _NN_,0,1,2,3,4,5,6,7
+ .if _UNROLL_CNT == (ROUNDS_512/8)
+ paddq ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_
+ .else
+ paddq ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_
+ .endif
+.endr
+ _stX 0 #free up a register
+ incl %edx #bump round counter
+ movd %edx,%xmm0 #inject the tweak
+ .if _UNROLL_CNT == (ROUNDS_512/8)
+ paddq ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5 #_RN_ is a multiple of 4, and 4k % 3 == k % 3, so this equals ((_RN_/4)+1) % 3
+ paddq ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6
+ paddq %xmm0 ,%xmm7
+ .else #looping version
+ paddq ksTwk+16*1-F_O(%esi),%xmm5
+ paddq ksTwk+16*2-F_O(%esi),%xmm6
+ paddq %xmm0 ,%xmm7
+ # "rotate" key schedule on the stack (for next time through)
+ movq ksKey -F_O(%esi),%xmm0
+ movq %xmm0,ksKey+16*(WCNT+1)-F_O(%esi)
+ movq ksTwk -F_O(%esi),%xmm0
+ movq %xmm0,ksTwk+16*3 -F_O(%esi)
+ addl $16,%esi #bump rolling pointer
+ .endif
+ _ldX 0 #restore X0
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS
+ .endif
+.endm #R_512_FourRounds
+#################
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+_Put_XMM_512: #store xmm0..xmm7 to X_stk[0..7]; the +4 skips the return address pushed by the call
+ .irp _NN_,0,1,2,3,4,5,6,7
+ movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp)
+ .endr
+ ret
+#
+_Get_XMM_512: #reload xmm0..xmm7 from X_stk[0..7] (same +4 adjustment)
+ .irp _NN_,0,1,2,3,4,5,6,7
+ movq X_stk+4+\_NN_*8(%esp),%xmm\_NN_
+ .endr
+ ret
+.endif
+#
+#################
+#
+C_label Skein_512_Process_Block
+ WCNT = 8 #WCNT=8 for Skein-512
+ Setup_Stack WCNT,ROUNDS_512
+ # main hash loop for Skein_512 (one iteration per input block; ecx = blocks remaining)
+Skein_512_block_loop:
+ movd bitAdd(%ebx) ,%xmm0
+ movq TWEAK+0(%edi),%xmm1
+ movq TWEAK+8(%edi),%xmm2
+ paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter
+ movq %xmm1,TWEAK(%edi) #save updated tweak value T0 (for next time)
+ movq %xmm2,%xmm0
+ xorpd %xmm1,%xmm0 #compute overall tweak parity
+ movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack
+ movdqa %xmm2,ksTwk+16*1-F_O(%ebp)
+ movdqa %xmm0,ksTwk+16*2-F_O(%ebp)
+
+ movl blkPtr(%ebx),%esi #esi --> input block
+ movl $KW_PARITY_LO,%eax #init key schedule parity accumulator
+ movl $KW_PARITY_HI,%edx
+ movd %eax ,%xmm0
+ movd %edx ,%xmm7
+ unpcklps %xmm7,%xmm0 #combine: low qword of xmm0 = KW_PARITY_HI:KW_PARITY_LO
+#
+ .irp _NN_,7,6,5,4,3,2,1 #copy in the chaining vars (skip #0 for now)
+ movq X_VARS+8*\_NN_(%edi),%xmm\_NN_
+ xorpd %xmm\_NN_,%xmm0 #update overall parity
+ movdqa %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp)
+ .if \_NN_ == 5
+ paddq %xmm1,%xmm5 #inject the initial tweak words
+ paddq %xmm2,%xmm6 # (before they get trashed in %xmm1/2)
+ .endif
+ .endr
+ movq X_VARS(%edi),%xmm4 #handle #0 now
+ xorpd %xmm4,%xmm0 #update overall parity
+ movdqa %xmm4,ksKey+16* 0 -F_O(%ebp) #save the key value in slot #0
+ movdqa %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array
+#
+ movq %xmm4,%xmm0
+ .irp _NN_,7,6,5, 3,2,1,0 #perform the initial key injection (except #4: xmm4 is the scratch register here)
+ movq 8*\_NN_(%esi),%xmm4 #and save a copy of the input block on stack
+ movq %xmm4,8*\_NN_+Wcopy(%esp)
+ paddq %xmm4,%xmm\_NN_
+ .endr
+ movq 8*4(%esi),%xmm4 #get input block word #4
+ movq %xmm4,8*4+Wcopy(%esp)
+ paddq ksKey+16*4-F_O(%ebp),%xmm4#inject the initial key
+#
+.if _SKEIN_DEBUG #debug dump of state at this point
+ Skein_Debug_Block 512
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+ addl $WCNT*8,%esi #skip to the next block
+ movl %esi,blkPtr(%ebx) #save the updated block pointer
+ #
+ # now the key schedule is computed. Start the rounds
+ #
+ xorl %edx,%edx #edx = round counter
+.if SKEIN_ASM_UNROLL & 512
+_UNROLL_CNT = ROUNDS_512/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ .if ((ROUNDS_512/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_512"
+ .endif
+ movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey
+Skein_512_round_loop: # (since there's no 16* scaled address mode)
+.endif
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ R_512_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 512) == 0
+ cmpl $2*(ROUNDS_512/8),%edx #edx is bumped once per four rounds
+ jb Skein_512_round_loop
+.endif
+ #----------------------------
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ andb $FIRST_MASK8,TWEAK +15(%edi) #clear the FIRST flag bit for the next block
+.irp _NN_,0,2,4,6 #do the aligned ones first
+ xorpd Wcopy+8*\_NN_(%esp),%xmm\_NN_
+ movq %xmm\_NN_,X_VARS+8*\_NN_(%edi)
+.endr
+.irp _NN_,1,3,5,7 #now we have some register space available
+ movq Wcopy+8*\_NN_(%esp),%xmm0
+ xorpd %xmm0,%xmm\_NN_
+ movq %xmm\_NN_,X_VARS+8*\_NN_(%edi)
+.endr
+.if _SKEIN_DEBUG
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+.endif
+ # go back for more blocks, if needed
+ decl %ecx
+ jnz Skein_512_block_loop
+
+ Reset_Stack _Skein_512_Process_Block
+ ret
+#
+.ifdef _SKEIN_CODE_SIZE
+C_label Skein_512_Process_Block_CodeSize #returns eax = byte distance from _Skein_512_Process_Block to this label
+ movl $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax
+ ret
+#
+C_label Skein_512_Unroll_Cnt #returns eax = _UNROLL_CNT when it differs from ROUNDS_512/8, otherwise 0
+ .if _UNROLL_CNT <> ROUNDS_512/8
+ movl $_UNROLL_CNT,%eax
+ .else
+ xorl %eax,%eax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#----------------------------------------------------------------
+#
+.if _USE_ASM_ & 1024
+ .global _Skein1024_Process_Block
+# (the entry point below is labelled Skein1024_Process_Block, without an underscore after "Skein")
+# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+R_1024_REGS = (5) #keep this many block variables in registers
+#
+################
+.if _SKEIN_DEBUG # subroutines for saving/restoring X_stk for debug routines
+_Put_XMM_1024: #store xmm0..xmm<R_1024_REGS-1> to X_stk; the +4 skips the return address pushed by the call
+_NN_ = 0
+ .rept R_1024_REGS
+ .irp _rr_,%(_NN_) #%(_NN_) turns the counter value into the register number text
+ movq %xmm\_rr_,X_stk+4+8*_NN_(%esp)
+ .endr
+_NN_ = _NN_+1
+ .endr
+ ret
+#
+_Get_XMM_1024: #reload xmm0..xmm<R_1024_REGS-1> from X_stk (same +4 adjustment)
+_NN_ = 0
+ .rept R_1024_REGS
+ .irp _rr_,%(_NN_)
+ movq X_stk+4+8*_NN_(%esp),%xmm\_rr_
+ .endr
+_NN_ = _NN_+1
+ .endr
+ ret
+.endif
+#
+#################
+# MACRO: one mix step -- X[x0] += X[x1]; X[x1] = (X[x1] << Rc) ^ (X[x1] >> (64-Rc)) ^ X[x0]
+.macro MixStep_1024 x0,x1,rotIdx0,rotIdx1,_debug_=0
+_r0_ = \x0 #default, if already loaded
+_r1_ = \x1
+ # load the regs (if necessary): words >= R_1024_REGS live in X_stk and are staged in xmm5/xmm6
+ .if (\x0 >= R_1024_REGS)
+_r0_ = 5
+ movq X_stk+8*(\x0)(%esp),%xmm5
+ .endif
+ .if (\x1 >= R_1024_REGS)
+_r1_ = 6
+ movq X_stk+8*(\x1)(%esp),%xmm6
+ .endif
+ # do the mix (xmm7 is scratch for the rotate)
+ .irp _rx_,%((rotIdx0) && 7)
+_Rc_ = RC_1024_\_rx_&&_\rotIdx1 #rotation constant
+ .endr
+ .irp _x0_,%_r0_
+ .irp _x1_,%_r1_
+ paddq %xmm\_x1_,%xmm\_x0_
+ movq %xmm\_x1_,%xmm7
+ psllq $ _Rc_ ,%xmm\_x1_
+ psrlq $64-_Rc_ ,%xmm7
+ xorpd %xmm\_x0_,%xmm\_x1_
+ xorpd %xmm7 ,%xmm\_x1_
+ .endr
+ .endr
+ # save the regs (if necessary)
+ .if (\x0 >= R_1024_REGS)
+ movq %xmm5,X_stk+8*(\x0)(%esp)
+ .endif
+ .if (\x1 >= R_1024_REGS)
+ movq %xmm6,X_stk+8*(\x1)(%esp)
+ .endif
+ # debug output (argument name spelled exactly as declared: rotIdx0)
+ .if _SKEIN_DEBUG && (\_debug_)
+ Skein_Debug_Round 1024,%((\rotIdx0)+1),SAVE_REGS
+ .endif
+.endm
+#################
+# MACRO: four rounds starting at round _RR_, followed by one key injection
+# (the fully unrolled path indexes the schedule with the caller's _Rbase_ symbol, not with _RR_)
+.macro R_1024_FourRounds _RR_
+ #--------- round _RR_
+ MixStep_1024 0, 1,%((\_RR_)+0),0
+ MixStep_1024 2, 3,%((\_RR_)+0),1
+ MixStep_1024 4, 5,%((\_RR_)+0),2
+ MixStep_1024 6, 7,%((\_RR_)+0),3
+ MixStep_1024 8, 9,%((\_RR_)+0),4
+ MixStep_1024 10,11,%((\_RR_)+0),5
+ MixStep_1024 12,13,%((\_RR_)+0),6
+ MixStep_1024 14,15,%((\_RR_)+0),7,1
+ #--------- round _RR_+1
+ MixStep_1024 0, 9,%((\_RR_)+1),0
+ MixStep_1024 2,13,%((\_RR_)+1),1
+ MixStep_1024 6,11,%((\_RR_)+1),2
+ MixStep_1024 4,15,%((\_RR_)+1),3
+ MixStep_1024 10, 7,%((\_RR_)+1),4
+ MixStep_1024 12, 3,%((\_RR_)+1),5
+ MixStep_1024 14, 5,%((\_RR_)+1),6
+ MixStep_1024 8, 1,%((\_RR_)+1),7,1
+ #--------- round _RR_+2
+ MixStep_1024 0, 7,%((\_RR_)+2),0
+ MixStep_1024 2, 5,%((\_RR_)+2),1
+ MixStep_1024 4, 3,%((\_RR_)+2),2
+ MixStep_1024 6, 1,%((\_RR_)+2),3
+ MixStep_1024 12,15,%((\_RR_)+2),4
+ MixStep_1024 14,13,%((\_RR_)+2),5
+ MixStep_1024 8,11,%((\_RR_)+2),6
+ MixStep_1024 10, 9,%((\_RR_)+2),7,1
+ #--------- round _RR_+3
+ MixStep_1024 0,15,%((\_RR_)+3),0
+ MixStep_1024 2,11,%((\_RR_)+3),1
+ MixStep_1024 6,13,%((\_RR_)+3),2
+ MixStep_1024 4, 9,%((\_RR_)+3),3
+ MixStep_1024 14, 1,%((\_RR_)+3),4
+ MixStep_1024 8, 5,%((\_RR_)+3),5
+ MixStep_1024 10, 3,%((\_RR_)+3),6
+ MixStep_1024 12, 7,%((\_RR_)+3),7,1
+
+ incl %edx #edx = round number
+ movd %edx,%xmm7
+
+ #inject the key (words < R_1024_REGS are in registers; the others are updated in X_stk via xmm6)
+.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .if _UNROLL_CNT <> (ROUNDS_1024/8)
+ .if \_NN_ < R_1024_REGS
+ paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm\_NN_
+ .else
+ movq X_stk+ 8*\_NN_(%esp),%xmm6
+ .if \_NN_ == 15
+ paddq %xmm7,%xmm6
+ .elseif \_NN_ == 14
+ paddq ksTwk+16*2-F_O(%esi),%xmm6
+ .elseif \_NN_ == 13
+ paddq ksTwk+16*1-F_O(%esi),%xmm6
+ .endif
+ paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm6
+ movq %xmm6,X_stk+ 8*\_NN_(%esp)
+ .endif
+ .else
+ .if \_NN_ < R_1024_REGS
+ paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm\_NN_
+ .else
+ movq X_stk+ 8*\_NN_(%esp), %xmm6
+ paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6
+ .if \_NN_ == 15
+ paddq %xmm7,%xmm6
+ .elseif \_NN_ == 14
+ paddq ksTwk+16*(((_Rbase_/4)+2) % 3)-F_O(%ebp),%xmm6
+ .elseif \_NN_ == 13
+ paddq ksTwk+16*(((_Rbase_/4)+1) % 3)-F_O(%ebp),%xmm6
+ .endif
+ movq %xmm6,X_stk+ 8*\_NN_(%esp)
+ .endif
+ .endif
+.endr
+ .if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack
+ movq ksKey-F_O(%esi), %xmm6
+ movq ksTwk-F_O(%esi), %xmm7
+ movq %xmm6,ksKey+16*(WCNT+1)-F_O(%esi)
+ movq %xmm7,ksTwk+16* 3 -F_O(%esi)
+ addl $16,%esi #bump rolling pointer
+ .endif
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS
+ .endif
+.endm #R_1024_FourRounds
+#
+################
+#
+C_label Skein1024_Process_Block
+# Processes blkCnt blocks; X[0..R_1024_REGS-1] stay in xmm registers, the remaining words live in X_stk
+ WCNT = 16 #WCNT=16 for Skein-1024
+ Setup_Stack WCNT,ROUNDS_1024
+ addl $0x80,%edi #bias the edi ctxt offsets to keep them all short
+ # main hash loop for Skein1024
+Skein1024_block_loop:
+ movd bitAdd(%ebx) ,%xmm0
+ movq TWEAK+0-0x80(%edi),%xmm1
+ movq TWEAK+8-0x80(%edi),%xmm2
+ paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter
+ movq %xmm1,TWEAK-0x80(%edi) #save updated tweak value T0 (for next time)
+ movq %xmm2,%xmm0
+ xorpd %xmm1,%xmm0 #compute overall tweak parity
+ movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack
+ movdqa %xmm2,ksTwk+16-F_O(%ebp)
+ movdqa %xmm0,ksTwk+32-F_O(%ebp)
+
+ movl blkPtr(%ebx),%esi #esi --> input block
+ movl $KW_PARITY_LO,%eax #init key schedule parity accumulator
+ movl $KW_PARITY_HI,%edx
+ movd %eax ,%xmm7
+ movd %edx ,%xmm6
+ unpcklps %xmm6,%xmm7 #combine: low qword of xmm7 = KW_PARITY_HI:KW_PARITY_LO
+#
+ leal 0x80(%esp),%eax #use short offsets for Wcopy, X_stk writes below
+.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ movq X_VARS+8*\_NN_-0x80(%edi),%xmm6
+ xorpd %xmm6,%xmm7 #update overall parity
+ movdqa %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack
+ .if \_NN_ < R_1024_REGS
+ _rr_ = \_NN_
+ .else
+ _rr_ = R_1024_REGS
+ .endif
+ .irp _rn_,%(_rr_)
+ movq 8*\_NN_(%esi),%xmm\_rn_ #save copy of the input block on stack
+ movq %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax) #(for feedforward later)
+ paddq %xmm6,%xmm\_rn_ #inject the key into the block
+ .if \_NN_ == 13
+ paddq %xmm1,%xmm\_rn_ #inject the initial tweak words
+ .elseif \_NN_ == 14
+ paddq %xmm2,%xmm\_rn_
+ .endif
+ .if \_NN_ >= R_1024_REGS #only save X[5..15] on stack, leave X[0..4] in regs
+ movq %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax)
+ .endif
+ .endr
+.endr
+ movdqa %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array
+#
+.if _SKEIN_DEBUG #debug dump of state at this point
+ Skein_Debug_Block 1024
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS
+.endif
+ addl $WCNT*8,%esi #skip to the next block
+ movl %esi,blkPtr(%ebx) #save the updated block pointer
+ #
+ # now the key schedule is computed. Start the rounds
+ #
+ xorl %edx,%edx #edx = round counter
+.if SKEIN_ASM_UNROLL & 1024
+_UNROLL_CNT = ROUNDS_1024/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ .if ((ROUNDS_1024/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_1024"
+ .endif
+ movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey
+Skein_1024_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ R_1024_FourRounds %_Rbase_
+_Rbase_ = _Rbase_+4
+.endr #rept _UNROLL_CNT*2
+#
+.if (SKEIN_ASM_UNROLL & 1024) == 0
+ cmp $2*(ROUNDS_1024/8),%edx
+ jb Skein_1024_round_loop
+.endif
+ andb $FIRST_MASK8,TWEAK +15-0x80(%edi) #clear tweak bit for next time thru
+ #----------------------------
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ leal 0x80(%esp),%eax #allow short offsets to X_stk and Wcopy
+.irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ .if \_NN_ < R_1024_REGS
+ .if \_NN_ && 1 #already in regs: no load needed
+ movq Wcopy+ 8*\_NN_-0x80(%eax),%xmm7 #unaligned
+ xorpd %xmm7,%xmm\_NN_
+ .else
+ xorpd Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_ #aligned
+ .endif
+ movq %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi)
+ .else
+ movq X_stk+8*\_NN_-0x80(%eax),%xmm7 #load X value from stack
+ .if \_NN_ && 1
+ movq Wcopy+8*\_NN_-0x80(%eax),%xmm6 #unaligned
+ xorpd %xmm6,%xmm7
+ .else
+ xorpd Wcopy+8*\_NN_-0x80(%eax),%xmm7 #aligned
+ .endif
+ movq %xmm7,X_VARS+8*\_NN_-0x80(%edi)
+ .endif
+.endr
+.if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD #no need to save regs on stack here
+.endif
+ # go back for more blocks, if needed
+ decl %ecx
+ jnz Skein1024_block_loop
+
+ Reset_Stack _Skein1024_Process_Block
+ ret
+#
+.ifdef _SKEIN_CODE_SIZE
+C_label Skein1024_Process_Block_CodeSize #returns eax = byte distance from _Skein1024_Process_Block to this label
+ movl $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax
+ ret
+#
+C_label Skein1024_Unroll_Cnt #returns eax = _UNROLL_CNT when it differs from ROUNDS_1024/8, otherwise 0
+ .if _UNROLL_CNT <> ROUNDS_1024/8
+ movl $_UNROLL_CNT,%eax
+ .else
+ xorl %eax,%eax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ & 1024
+#----------------------------------------------------------------
+ .end
diff --git a/Additional_Implementations/skein_perf_core2.txt b/Additional_Implementations/skein_perf_core2.txt
new file mode 100644
index 000000000000..d8b795675c3d
--- /dev/null
+++ b/Additional_Implementations/skein_perf_core2.txt
@@ -0,0 +1,1440 @@
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:27:59,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 3450.00 3450.00 | 8718.00 8718.00 | 41700.00 41730.00 | //: 32-bit, GCC_v3.42 [ C =...]
+ 2_ || 1719.00 1725.00 | 4323.00 4326.00 | 20835.00 20850.00 | //: 32-bit, GCC_v3.42 [ C =...]
+ 4_ || 861.00 861.00 | 2149.50 2151.00 | 10408.50 11277.00 | //: 32-bit, GCC_v3.42 [ C =...]
+ 8_ || 429.00 429.75 | 1074.75 1074.75 | 5204.25 5205.00 | //: 32-bit, GCC_v3.42 [ C =...]
+ 10_ || 343.80 344.40 | 865.80 866.40 | 4167.00 4167.60 | //: 32-bit, GCC_v3.42 [ C =...]
+ 16_ || 214.88 214.88 | 538.50 538.50 | 2603.25 2603.63 | //: 32-bit, GCC_v3.42 [ C =...]
+ 32_ || 107.06 115.88 | 269.25 269.25 | 1301.25 1301.25 | //: 32-bit, GCC_v3.42 [ C =...]
+ 64_ || 85.31 85.41 | 132.66 132.75 | 650.53 650.63 | //: 32-bit, GCC_v3.42 [ C =...]
+ 100_ || 82.20 88.86 | 126.78 126.78 | 416.46 416.46 | //: 32-bit, GCC_v3.42 [ C =...]
+ 128_ || 69.42 69.56 | 97.83 97.83 | 324.98 325.03 | //: 32-bit, GCC_v3.42 [ C =...]
+ 256_ || 56.70 56.74 | 76.34 76.34 | 242.95 242.98 | //: 32-bit, GCC_v3.42 [ C =...]
+ 512_ || 53.06 53.12 | 65.50 65.53 | 200.66 200.67 | //: 32-bit, GCC_v3.42 [ C =...]
+ 1000_ || 52.33 52.42 | 61.66 61.69 | 183.89 183.92 | //: 32-bit, GCC_v3.42 [ C =...]
+ 1024_ || 51.15 51.23 | 60.07 60.08 | 179.52 179.55 | //: 32-bit, GCC_v3.42 [ C =...]
+ 2048_ || 50.20 50.30 | 57.36 57.42 | 168.86 168.97 | //: 32-bit, GCC_v3.42 [ C =...]
+ 4096_ || 49.71 49.77 | 56.00 56.01 | 163.65 166.96 | //: 32-bit, GCC_v3.42 [ C =...]
+ 8192_ || 49.48 50.94 | 55.33 57.07 | 169.60 184.62 | //: 32-bit, GCC_v3.42 [ C =...]
+ 10000_ || 53.64 53.70 | 60.89 60.99 | 186.13 186.98 | //: 32-bit, GCC_v3.42 [ C =...]
+ 16384_ || 53.48 53.80 | 60.35 60.70 | 164.26 167.23 | //: 32-bit, GCC_v3.42 [ C =...]
+ 32768_ || 53.47 53.50 | 60.22 60.37 | 164.15 182.33 | //: 32-bit, GCC_v3.42 [ C =...]
+ 100000_ || 53.85 100.93 | 60.43 61.36 | 164.25 169.78 | //: 32-bit, GCC_v3.42 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [ C =...]
+ Block || 14464 bytes | 32544 bytes | 83024 bytes | //: 32-bit, GCC_v3.42 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:28:12,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2802.00 2814.00 | 5952.00 5952.00 | 30606.00 30606.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 2_ || 1392.00 1395.00 | 2976.00 2979.00 | 15309.00 15309.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 4_ || 696.00 697.50 | 1486.50 1486.50 | 7653.00 7654.50 | //: 32-bit, MSC_v9.00 [ C =...]
+ 8_ || 347.25 348.00 | 741.75 742.50 | 3825.75 3827.25 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10_ || 278.40 278.40 | 593.40 593.40 | 3063.00 3063.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 16_ || 174.38 174.38 | 370.50 370.50 | 1913.25 1913.25 | //: 32-bit, MSC_v9.00 [ C =...]
+ 32_ || 86.25 86.25 | 186.00 186.75 | 957.00 957.19 | //: 32-bit, MSC_v9.00 [ C =...]
+ 64_ || 62.91 62.91 | 92.91 92.91 | 478.50 478.50 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100_ || 65.52 65.58 | 88.02 88.08 | 306.30 306.30 | //: 32-bit, MSC_v9.00 [ C =...]
+ 128_ || 50.72 50.72 | 68.53 68.58 | 238.64 238.88 | //: 32-bit, MSC_v9.00 [ C =...]
+ 256_ || 44.88 45.05 | 56.11 56.13 | 178.17 178.24 | //: 32-bit, MSC_v9.00 [ C =...]
+ 512_ || 41.79 41.86 | 49.79 49.91 | 147.39 147.47 | //: 32-bit, MSC_v9.00 [ C =...]
+ 1000_ || 41.26 41.41 | 47.96 47.96 | 135.28 135.29 | //: 32-bit, MSC_v9.00 [ C =...]
+ 1024_ || 40.40 40.40 | 46.79 46.81 | 132.05 132.08 | //: 32-bit, MSC_v9.00 [ C =...]
+ 2048_ || 39.62 39.62 | 45.23 45.23 | 124.39 124.40 | //: 32-bit, MSC_v9.00 [ C =...]
+ 4096_ || 38.98 38.99 | 44.34 44.44 | 120.58 120.60 | //: 32-bit, MSC_v9.00 [ C =...]
+ 8192_ || 38.83 38.87 | 44.06 47.57 | 118.65 119.36 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10000_ || 38.86 39.08 | 44.13 44.21 | 119.88 120.11 | //: 32-bit, MSC_v9.00 [ C =...]
+ 16384_ || 38.74 39.03 | 43.76 44.01 | 108.36 117.94 | //: 32-bit, MSC_v9.00 [ C =...]
+ 32768_ || 36.77 38.19 | 41.28 41.57 | 105.50 114.79 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ || 38.85 39.09 | 43.56 43.77 | 105.79 114.18 | //: 32-bit, MSC_v9.00 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+ Block || 10192 bytes | 22960 bytes | 53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:28:29,Oct 7 2008 by 'MSC_v6.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 8688.00 8712.00 | 26466.00 26472.00 | 61638.00 61680.00 | //: 32-bit, MSC_v6.00 [ C =...]
+ 2_ || 4347.00 4362.00 | 13293.00 13302.00 | 30036.00 30372.00 | //: 32-bit, MSC_v6.00 [ C =...]
+ 4_ || 2184.00 2199.00 | 6457.50 6508.50 | 15267.00 15285.00 | //: 32-bit, MSC_v6.00 [ C =...]
+ 8_ || 1093.50 1098.75 | 3227.25 3227.25 | 7398.75 7467.75 | //: 32-bit, MSC_v6.00 [ C =...]
+ 10_ || 873.60 878.40 | 2405.40 2574.00 | 5661.00 5668.20 | //: 32-bit, MSC_v6.00 [ C =...]
+ 16_ || 522.00 524.25 | 1455.00 1455.38 | 3459.38 3489.38 | //: 32-bit, MSC_v6.00 [ C =...]
+ 32_ || 260.06 261.00 | 727.69 732.56 | 1727.44 1728.00 | //: 32-bit, MSC_v6.00 [ C =...]
+ 64_ || 186.66 186.94 | 362.16 362.25 | 848.25 856.97 | //: 32-bit, MSC_v6.00 [ C =...]
+ 100_ || 194.10 194.10 | 344.52 344.58 | 542.22 545.28 | //: 32-bit, MSC_v6.00 [ C =...]
+ 128_ || 151.27 151.36 | 266.06 268.59 | 426.23 426.33 | //: 32-bit, MSC_v6.00 [ C =...]
+ 256_ || 137.67 137.91 | 219.66 219.68 | 314.74 317.74 | //: 32-bit, MSC_v6.00 [ C =...]
+ 512_ || 130.21 130.22 | 195.96 218.33 | 263.45 266.96 | //: 32-bit, MSC_v6.00 [ C =...]
+ 1000_ || 129.17 129.60 | 183.96 185.04 | 237.88 240.37 | //: 32-bit, MSC_v6.00 [ C =...]
+ 1024_ || 126.35 126.53 | 178.10 178.44 | 251.47 256.21 | //: 32-bit, MSC_v6.00 [ C =...]
+ 2048_ || 133.77 133.81 | 190.95 191.16 | 240.21 242.87 | //: 32-bit, MSC_v6.00 [ C =...]
+ 4096_ || 116.36 124.81 | 169.39 178.51 | 220.31 222.41 | //: 32-bit, MSC_v6.00 [ C =...]
+ 8192_ || 123.60 125.60 | 171.36 174.54 | 215.79 233.44 | //: 32-bit, MSC_v6.00 [ C =...]
+ 10000_ || 124.42 127.19 | 183.83 188.08 | 236.37 238.04 | //: 32-bit, MSC_v6.00 [ C =...]
+ 16384_ || 133.15 133.46 | 172.38 183.65 | 222.17 232.39 | //: 32-bit, MSC_v6.00 [ C =...]
+ 32768_ || 126.67 128.66 | 180.67 186.04 | 225.01 227.24 | //: 32-bit, MSC_v6.00 [ C =...]
+ 100000_ || 123.83 125.06 | 172.26 186.33 | 205.87 224.72 | //: 32-bit, MSC_v6.00 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1486 bytes | 1348 bytes | 1445 bytes | //: 32-bit, MSC_v6.00 [ C =...]
+ Block || 14094 bytes | 35580 bytes | 69258 bytes | //: 32-bit, MSC_v6.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:28:44,Oct 7 2008 by 'MSC_v4.20', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5028.00 5058.00 | 9948.00 10044.00 | 35466.00 35520.00 | //: 32-bit, MSC_v4.20 [ C =...]
+ 2_ || 2508.00 2511.00 | 5070.00 5076.00 | 18090.00 18132.00 | //: 32-bit, MSC_v4.20 [ C =...]
+ 4_ || 1255.50 1255.50 | 2523.00 2523.00 | 9063.00 9063.00 | //: 32-bit, MSC_v4.20 [ C =...]
+ 8_ || 627.75 627.75 | 1261.50 1261.50 | 4536.00 4536.00 | //: 32-bit, MSC_v4.20 [ C =...]
+ 10_ || 502.20 502.20 | 1013.40 1014.60 | 3685.80 3685.80 | //: 32-bit, MSC_v4.20 [ C =...]
+ 16_ || 313.88 313.88 | 624.75 632.63 | 2284.88 2287.88 | //: 32-bit, MSC_v4.20 [ C =...]
+ 32_ || 155.25 155.25 | 312.38 312.38 | 1143.75 1143.75 | //: 32-bit, MSC_v4.20 [ C =...]
+ 64_ || 114.56 114.66 | 155.72 155.72 | 569.91 569.91 | //: 32-bit, MSC_v4.20 [ C =...]
+ 100_ || 120.66 120.78 | 148.92 148.98 | 363.60 363.66 | //: 32-bit, MSC_v4.20 [ C =...]
+ 128_ || 93.84 93.89 | 116.58 116.63 | 284.58 284.58 | //: 32-bit, MSC_v4.20 [ C =...]
+ 256_ || 83.46 83.48 | 95.20 95.20 | 213.77 213.77 | //: 32-bit, MSC_v4.20 [ C =...]
+ 512_ || 78.18 78.19 | 85.08 85.09 | 177.38 177.38 | //: 32-bit, MSC_v4.20 [ C =...]
+ 1000_ || 77.42 77.42 | 81.88 81.88 | 161.92 161.92 | //: 32-bit, MSC_v4.20 [ C =...]
+ 1024_ || 75.54 75.55 | 79.53 79.53 | 158.23 158.23 | //: 32-bit, MSC_v4.20 [ C =...]
+ 2048_ || 74.22 74.23 | 77.37 77.38 | 148.59 149.28 | //: 32-bit, MSC_v4.20 [ C =...]
+ 4096_ || 73.56 73.57 | 76.51 76.51 | 142.82 145.02 | //: 32-bit, MSC_v4.20 [ C =...]
+ 8192_ || 73.23 73.24 | 73.69 74.43 | 143.02 143.30 | //: 32-bit, MSC_v4.20 [ C =...]
+ 10000_ || 73.30 73.36 | 73.75 75.53 | 144.04 144.80 | //: 32-bit, MSC_v4.20 [ C =...]
+ 16384_ || 69.35 73.11 | 71.79 73.01 | 131.31 140.35 | //: 32-bit, MSC_v4.20 [ C =...]
+ 32768_ || 73.05 73.36 | 72.43 74.54 | 126.01 139.95 | //: 32-bit, MSC_v4.20 [ C =...]
+ 100000_ || 69.28 70.12 | 66.33 67.35 | 129.68 136.97 | //: 32-bit, MSC_v4.20 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1152 bytes | 1024 bytes | 1088 bytes | //: 32-bit, MSC_v4.20 [ C =...]
+ Block || 11968 bytes | 23776 bytes | 55360 bytes | //: 32-bit, MSC_v4.20 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:28:57,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 780.00 786.00 | 1110.00 1110.00 | 3288.00 3318.00 | //: 64-bit, MSC_v9.00 [ C =...]
+ 2_ || 402.00 402.00 | 549.00 552.00 | 1659.00 1659.00 | //: 64-bit, MSC_v9.00 [ C =...]
+ 4_ || 199.50 201.00 | 274.50 276.00 | 829.50 829.50 | //: 64-bit, MSC_v9.00 [ C =...]
+ 8_ || 96.75 97.50 | 134.25 135.00 | 414.75 414.75 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10_ || 78.60 79.80 | 109.80 109.80 | 331.20 331.80 | //: 64-bit, MSC_v9.00 [ C =...]
+ 16_ || 48.38 48.38 | 67.13 67.13 | 224.25 224.63 | //: 64-bit, MSC_v9.00 [ C =...]
+ 32_ || 26.63 26.81 | 36.38 36.38 | 112.31 112.31 | //: 64-bit, MSC_v9.00 [ C =...]
+ 64_ || 17.06 17.06 | 16.78 16.78 | 51.66 51.75 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100_ || 16.74 16.80 | 15.54 15.54 | 33.30 33.30 | //: 64-bit, MSC_v9.00 [ C =...]
+ 128_ || 12.98 13.08 | 11.95 12.00 | 25.78 25.83 | //: 64-bit, MSC_v9.00 [ C =...]
+ 256_ || 10.99 10.99 | 9.19 9.21 | 19.03 19.03 | //: 64-bit, MSC_v9.00 [ C =...]
+ 512_ || 10.14 10.18 | 7.84 7.85 | 15.60 15.60 | //: 64-bit, MSC_v9.00 [ C =...]
+ 1000_ || 9.88 10.67 | 7.38 7.38 | 14.16 14.17 | //: 64-bit, MSC_v9.00 [ C =...]
+ 1024_ || 9.60 9.64 | 7.18 7.18 | 13.74 13.74 | //: 64-bit, MSC_v9.00 [ C =...]
+ 2048_ || 9.35 9.38 | 6.83 6.83 | 12.84 12.84 | //: 64-bit, MSC_v9.00 [ C =...]
+ 4096_ || 9.28 9.28 | 6.69 6.70 | 12.40 12.40 | //: 64-bit, MSC_v9.00 [ C =...]
+ 8192_ || 9.18 9.21 | 6.58 6.59 | 12.28 12.28 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10000_ || 9.21 9.22 | 6.60 6.60 | 12.27 12.39 | //: 64-bit, MSC_v9.00 [ C =...]
+ 16384_ || 9.19 9.20 | 6.53 6.55 | 12.12 12.12 | //: 64-bit, MSC_v9.00 [ C =...]
+ 32768_ || 9.16 9.17 | 6.51 6.55 | 12.08 12.53 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ || 9.98 10.01 | 7.04 7.08 | 12.36 13.14 | //: 64-bit, MSC_v9.00 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+ Block || 2272 bytes | 4944 bytes | 15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:28:59,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 6204.00 6252.00 | 11058.00 11124.00 | 25662.00 25788.00 | //: 32-bit, BCC_v5.51 [ C =...]
+ 2_ || 3048.00 3060.00 | 5469.00 5481.00 | 12576.00 12672.00 | //: 32-bit, BCC_v5.51 [ C =...]
+ 4_ || 1515.00 1521.00 | 2731.50 2733.00 | 6303.00 6348.00 | //: 32-bit, BCC_v5.51 [ C =...]
+ 8_ || 756.75 760.50 | 1364.25 1367.25 | 3147.75 3162.75 | //: 32-bit, BCC_v5.51 [ C =...]
+ 10_ || 605.40 607.80 | 1092.60 1095.00 | 2541.60 2545.80 | //: 32-bit, BCC_v5.51 [ C =...]
+ 16_ || 379.50 380.62 | 682.88 683.25 | 1584.38 1590.00 | //: 32-bit, BCC_v5.51 [ C =...]
+ 32_ || 187.88 188.62 | 340.69 341.06 | 794.81 797.62 | //: 32-bit, BCC_v5.51 [ C =...]
+ 64_ || 138.19 138.28 | 169.69 169.78 | 420.75 420.84 | //: 32-bit, BCC_v5.51 [ C =...]
+ 100_ || 145.02 145.08 | 160.80 160.86 | 269.16 269.52 | //: 32-bit, BCC_v5.51 [ C =...]
+ 128_ || 112.92 112.92 | 125.39 125.44 | 210.00 210.23 | //: 32-bit, BCC_v5.51 [ C =...]
+ 256_ || 100.27 100.29 | 103.08 103.08 | 156.33 156.42 | //: 32-bit, BCC_v5.51 [ C =...]
+ 512_ || 93.98 94.00 | 91.90 91.91 | 129.40 129.41 | //: 32-bit, BCC_v5.51 [ C =...]
+ 1000_ || 93.02 93.02 | 88.44 88.44 | 118.61 118.69 | //: 32-bit, BCC_v5.51 [ C =...]
+ 1024_ || 90.80 90.81 | 86.34 86.34 | 115.78 115.86 | //: 32-bit, BCC_v5.51 [ C =...]
+ 2048_ || 89.22 89.22 | 77.11 83.54 | 95.12 95.16 | //: 32-bit, BCC_v5.51 [ C =...]
+ 4096_ || 81.62 81.62 | 75.83 75.83 | 92.17 100.45 | //: 32-bit, BCC_v5.51 [ C =...]
+ 8192_ || 81.80 88.11 | 75.18 78.19 | 90.69 92.09 | //: 32-bit, BCC_v5.51 [ C =...]
+ 10000_ || 81.32 84.70 | 76.65 78.80 | 92.85 94.82 | //: 32-bit, BCC_v5.51 [ C =...]
+ 16384_ || 83.13 83.59 | 76.92 77.00 | 92.05 93.27 | //: 32-bit, BCC_v5.51 [ C =...]
+ 32768_ || 83.07 84.01 | 76.76 77.91 | 92.12 94.08 | //: 32-bit, BCC_v5.51 [ C =...]
+ 100000_ || 83.48 84.08 | 77.08 78.59 | 93.38 102.21 | //: 32-bit, BCC_v5.51 [ C =...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [ C =...]
+ Block || 10732 bytes | 20964 bytes | 45988 bytes | //: 32-bit, BCC_v5.51 [ C =...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:07,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2592.00 2604.00 | 4848.00 4854.00 | 22278.00 22284.00 | //: 32-bit, BCC_v5.51 [asm=...]
+ 2_ || 1287.00 1293.00 | 2430.00 2430.00 | 11139.00 11139.00 | //: 32-bit, BCC_v5.51 [asm=...]
+ 4_ || 637.50 639.00 | 1213.50 1213.50 | 5565.00 5566.50 | //: 32-bit, BCC_v5.51 [asm=...]
+ 8_ || 318.75 319.50 | 606.75 606.75 | 2782.50 2783.25 | //: 32-bit, BCC_v5.51 [asm=...]
+ 10_ || 255.60 255.60 | 486.00 486.60 | 2228.40 2228.40 | //: 32-bit, BCC_v5.51 [asm=...]
+ 16_ || 159.75 159.75 | 301.88 302.25 | 1391.25 1391.62 | //: 32-bit, BCC_v5.51 [asm=...]
+ 32_ || 78.75 78.75 | 151.31 151.31 | 695.44 695.62 | //: 32-bit, BCC_v5.51 [asm=...]
+ 64_ || 55.69 57.28 | 74.81 74.91 | 347.81 347.81 | //: 32-bit, BCC_v5.51 [asm=...]
+ 100_ || 57.42 57.48 | 69.84 69.90 | 222.60 222.60 | //: 32-bit, BCC_v5.51 [asm=...]
+ 128_ || 44.53 44.58 | 54.38 54.38 | 173.67 173.67 | //: 32-bit, BCC_v5.51 [asm=...]
+ 256_ || 38.55 38.55 | 43.99 44.02 | 129.05 129.05 | //: 32-bit, BCC_v5.51 [asm=...]
+ 512_ || 35.60 35.60 | 38.66 38.67 | 106.62 106.62 | //: 32-bit, BCC_v5.51 [asm=...]
+ 1000_ || 34.89 34.89 | 37.18 37.18 | 97.72 97.72 | //: 32-bit, BCC_v5.51 [asm=...]
+ 1024_ || 34.23 34.98 | 35.85 35.86 | 95.40 95.40 | //: 32-bit, BCC_v5.51 [asm=...]
+ 2048_ || 33.86 33.86 | 34.66 34.66 | 89.79 89.80 | //: 32-bit, BCC_v5.51 [asm=...]
+ 4096_ || 33.22 33.59 | 33.92 34.26 | 86.99 86.99 | //: 32-bit, BCC_v5.51 [asm=...]
+ 8192_ || 33.11 33.11 | 33.80 33.92 | 74.64 77.45 | //: 32-bit, BCC_v5.51 [asm=...]
+ 10000_ || 31.46 33.40 | 31.80 32.58 | 78.61 78.66 | //: 32-bit, BCC_v5.51 [asm=...]
+ 16384_ || 31.49 32.78 | 32.16 33.51 | 76.07 76.13 | //: 32-bit, BCC_v5.51 [asm=...]
+ 32768_ || 32.01 32.58 | 32.74 33.18 | 75.73 76.02 | //: 32-bit, BCC_v5.51 [asm=...]
+ 100000_ || 32.23 32.45 | 33.33 61.75 | 84.30 85.34 | //: 32-bit, BCC_v5.51 [asm=...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [asm=...]
+ Block || 7588 bytes | 16636 bytes | 38262 bytes | //: 32-bit, BCC_v5.51 [asm=...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:12,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2484.00 2490.00 | 4830.00 4836.00 | 22182.00 22188.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 2_ || 1254.00 1254.00 | 2415.00 2415.00 | 11091.00 11091.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 4_ || 627.00 627.00 | 1207.50 1207.50 | 5545.50 5545.50 | //: 32-bit, MSC_v9.00 [asm=...]
+ 8_ || 313.50 313.50 | 603.00 603.75 | 2390.25 2478.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10_ || 250.20 252.00 | 485.40 488.40 | 1936.80 1959.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 16_ || 156.00 156.75 | 301.50 301.50 | 1386.00 1386.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 32_ || 77.81 77.81 | 150.94 151.31 | 692.81 692.81 | //: 32-bit, MSC_v9.00 [asm=...]
+ 64_ || 56.34 56.34 | 74.81 74.81 | 343.78 346.41 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100_ || 58.62 58.68 | 70.74 70.80 | 221.76 221.76 | //: 32-bit, MSC_v9.00 [asm=...]
+ 128_ || 45.47 45.47 | 55.08 55.08 | 168.94 173.02 | //: 32-bit, MSC_v9.00 [asm=...]
+ 256_ || 40.10 40.10 | 44.95 44.95 | 128.88 128.88 | //: 32-bit, MSC_v9.00 [asm=...]
+ 512_ || 37.49 37.55 | 39.94 39.94 | 92.99 92.99 | //: 32-bit, MSC_v9.00 [asm=...]
+ 1000_ || 34.12 34.16 | 35.44 35.44 | 85.27 85.31 | //: 32-bit, MSC_v9.00 [asm=...]
+ 1024_ || 33.30 33.30 | 34.58 34.59 | 83.24 83.25 | //: 32-bit, MSC_v9.00 [asm=...]
+ 2048_ || 32.70 32.70 | 36.20 36.20 | 89.82 89.82 | //: 32-bit, MSC_v9.00 [asm=...]
+ 4096_ || 35.09 35.09 | 35.50 35.57 | 87.04 87.05 | //: 32-bit, MSC_v9.00 [asm=...]
+ 8192_ || 34.83 35.38 | 35.12 35.64 | 76.07 84.71 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10000_ || 34.78 34.98 | 35.36 35.36 | 86.31 86.35 | //: 32-bit, MSC_v9.00 [asm=...]
+ 16384_ || 34.76 34.80 | 35.07 35.36 | 80.55 85.21 | //: 32-bit, MSC_v9.00 [asm=...]
+ 32768_ || 32.88 33.17 | 33.06 33.37 | 75.87 76.15 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ || 32.96 33.40 | 33.29 33.60 | 75.79 76.81 | //: 32-bit, MSC_v9.00 [asm=...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+ Block || 7588 bytes | 16636 bytes | 38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:17,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2490.00 2496.00 | 4824.00 4836.00 | 22332.00 22356.00 | //: 32-bit, GCC_v3.42 [asm=...]
+ 2_ || 1251.00 1260.00 | 2412.00 2415.00 | 11157.00 11166.00 | //: 32-bit, GCC_v3.42 [asm=...]
+ 4_ || 621.00 622.50 | 1204.50 1204.50 | 5571.00 5572.50 | //: 32-bit, GCC_v3.42 [asm=...]
+ 8_ || 310.50 311.25 | 602.25 602.25 | 2785.50 2786.25 | //: 32-bit, GCC_v3.42 [asm=...]
+ 10_ || 249.00 249.60 | 482.40 482.40 | 2233.20 2233.80 | //: 32-bit, GCC_v3.42 [asm=...]
+ 16_ || 155.25 155.63 | 300.75 301.50 | 1393.88 1393.88 | //: 32-bit, GCC_v3.42 [asm=...]
+ 32_ || 76.50 77.06 | 151.31 151.31 | 696.38 696.56 | //: 32-bit, GCC_v3.42 [asm=...]
+ 64_ || 55.78 56.06 | 75.19 75.19 | 348.19 348.19 | //: 32-bit, GCC_v3.42 [asm=...]
+ 100_ || 58.32 58.44 | 70.80 70.80 | 222.96 222.96 | //: 32-bit, GCC_v3.42 [asm=...]
+ 128_ || 45.14 45.52 | 55.08 55.13 | 173.72 173.77 | //: 32-bit, GCC_v3.42 [asm=...]
+ 256_ || 40.03 40.13 | 44.91 44.93 | 129.33 129.33 | //: 32-bit, GCC_v3.42 [asm=...]
+ 512_ || 37.38 37.50 | 39.77 39.79 | 106.58 106.66 | //: 32-bit, GCC_v3.42 [asm=...]
+ 1000_ || 36.94 37.03 | 38.19 38.19 | 97.66 97.69 | //: 32-bit, GCC_v3.42 [asm=...]
+ 1024_ || 35.75 36.13 | 37.24 37.24 | 95.29 95.32 | //: 32-bit, GCC_v3.42 [asm=...]
+ 2048_ || 35.36 35.44 | 35.94 35.94 | 88.77 89.67 | //: 32-bit, GCC_v3.42 [asm=...]
+ 4096_ || 35.02 35.02 | 35.31 35.38 | 77.07 86.35 | //: 32-bit, GCC_v3.42 [asm=...]
+ 8192_ || 32.18 32.20 | 32.30 32.31 | 74.72 77.04 | //: 32-bit, GCC_v3.42 [asm=...]
+ 10000_ || 32.28 32.34 | 32.41 32.43 | 78.36 78.77 | //: 32-bit, GCC_v3.42 [asm=...]
+ 16384_ || 32.16 33.29 | 32.20 33.16 | 76.16 78.39 | //: 32-bit, GCC_v3.42 [asm=...]
+ 32768_ || 33.20 33.70 | 33.22 33.33 | 75.84 76.10 | //: 32-bit, GCC_v3.42 [asm=...]
+ 100000_ || 33.26 33.96 | 33.11 33.41 | 75.75 76.29 | //: 32-bit, GCC_v3.42 [asm=...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [asm=...]
+ Block || 7588 bytes | 16636 bytes | 38262 bytes | //: 32-bit, GCC_v3.42 [asm=...]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:29:22,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 672.00 672.00 | 1068.00 1068.00 | 1920.00 1926.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 2_ || 336.00 336.00 | 534.00 534.00 | 963.00 963.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 4_ || 166.50 168.00 | 267.00 267.00 | 481.50 483.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 8_ || 81.00 81.00 | 130.50 131.25 | 240.00 240.75 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10_ || 64.80 65.40 | 107.40 108.00 | 192.00 192.60 | //: 64-bit, MSC_v9.00 [asm=...]
+ 16_ || 40.13 40.13 | 65.63 65.63 | 120.00 120.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 32_ || 20.06 20.06 | 32.81 32.81 | 59.63 59.81 | //: 64-bit, MSC_v9.00 [asm=...]
+ 64_ || 14.25 14.34 | 16.31 16.31 | 32.44 32.44 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100_ || 15.54 15.60 | 16.20 16.26 | 21.06 21.06 | //: 64-bit, MSC_v9.00 [asm=...]
+ 128_ || 11.81 11.86 | 11.44 11.48 | 14.86 14.86 | //: 64-bit, MSC_v9.00 [asm=...]
+ 256_ || 9.28 9.28 | 8.81 8.81 | 10.83 10.83 | //: 64-bit, MSC_v9.00 [asm=...]
+ 512_ || 8.43 8.43 | 7.46 7.46 | 8.66 8.66 | //: 64-bit, MSC_v9.00 [asm=...]
+ 1000_ || 8.18 8.18 | 6.97 6.97 | 7.77 7.78 | //: 64-bit, MSC_v9.00 [asm=...]
+ 1024_ || 7.98 8.50 | 6.81 7.38 | 7.58 7.58 | //: 64-bit, MSC_v9.00 [asm=...]
+ 2048_ || 7.75 7.75 | 6.47 6.47 | 7.05 7.05 | //: 64-bit, MSC_v9.00 [asm=...]
+ 4096_ || 7.65 7.65 | 6.30 6.30 | 6.78 6.78 | //: 64-bit, MSC_v9.00 [asm=...]
+ 8192_ || 7.59 7.59 | 6.21 6.21 | 6.64 6.64 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10000_ || 7.59 7.59 | 6.23 6.23 | 6.69 6.69 | //: 64-bit, MSC_v9.00 [asm=...]
+ 16384_ || 7.57 7.57 | 6.17 6.17 | 6.57 6.57 | //: 64-bit, MSC_v9.00 [asm=...]
+ 32768_ || 7.55 7.56 | 6.15 6.15 | 6.53 6.55 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ || 7.55 7.71 | 6.14 6.38 | 6.56 6.86 | //: 64-bit, MSC_v9.00 [asm=...]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+ Block || 2323 bytes | 4733 bytes | 11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:24,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 4728.00 4728.00 | 8352.00 8352.00 | 20034.00 20040.00 | //: 32-bit, GCC_v3.42 [ C =111]
+ 2_ || 2370.00 2370.00 | 4179.00 4179.00 | 9261.00 9264.00 | //: 32-bit, GCC_v3.42 [ C =111]
+ 4_ || 1092.00 1096.50 | 1924.50 1926.00 | 4624.50 4624.50 | //: 32-bit, GCC_v3.42 [ C =111]
+ 8_ || 544.50 545.25 | 1040.25 1047.75 | 2312.25 2313.00 | //: 32-bit, GCC_v3.42 [ C =111]
+ 10_ || 436.20 436.80 | 768.60 769.20 | 1852.20 1852.20 | //: 32-bit, GCC_v3.42 [ C =111]
+ 16_ || 272.63 273.00 | 480.38 519.38 | 1156.88 1157.25 | //: 32-bit, GCC_v3.42 [ C =111]
+ 32_ || 135.94 135.94 | 240.56 240.75 | 579.00 579.00 | //: 32-bit, GCC_v3.42 [ C =111]
+ 64_ || 100.88 101.53 | 129.75 129.84 | 289.59 289.69 | //: 32-bit, GCC_v3.42 [ C =111]
+ 100_ || 106.44 106.44 | 113.94 114.18 | 185.46 200.94 | //: 32-bit, GCC_v3.42 [ C =111]
+ 128_ || 83.06 83.06 | 89.11 89.11 | 144.61 144.61 | //: 32-bit, GCC_v3.42 [ C =111]
+ 256_ || 73.83 79.99 | 73.34 79.45 | 107.55 107.55 | //: 32-bit, GCC_v3.42 [ C =111]
+ 512_ || 69.16 69.18 | 65.32 65.39 | 88.89 88.92 | //: 32-bit, GCC_v3.42 [ C =111]
+ 1000_ || 68.45 68.45 | 62.84 62.92 | 81.36 81.38 | //: 32-bit, GCC_v3.42 [ C =111]
+ 1024_ || 66.83 66.86 | 61.34 61.34 | 79.42 79.43 | //: 32-bit, GCC_v3.42 [ C =111]
+ 2048_ || 65.67 65.73 | 59.33 59.33 | 74.70 74.71 | //: 32-bit, GCC_v3.42 [ C =111]
+ 4096_ || 65.08 65.15 | 58.33 58.33 | 72.33 72.34 | //: 32-bit, GCC_v3.42 [ C =111]
+ 8192_ || 65.76 70.08 | 62.66 62.66 | 77.08 77.15 | //: 32-bit, GCC_v3.42 [ C =111]
+ 10000_ || 70.01 70.33 | 62.84 62.92 | 77.70 77.70 | //: 32-bit, GCC_v3.42 [ C =111]
+ 16384_ || 69.93 70.32 | 62.63 62.71 | 72.64 72.73 | //: 32-bit, GCC_v3.42 [ C =111]
+ 32768_ || 69.31 69.90 | 58.90 59.54 | 73.37 76.24 | //: 32-bit, GCC_v3.42 [ C =111]
+ 100000_ || 67.54 70.40 | 59.09 59.39 | 72.65 73.26 | //: 32-bit, GCC_v3.42 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [ C =111]
+ Block || 2928 bytes | 5568 bytes | 11712 bytes | //: 32-bit, GCC_v3.42 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:31,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2952.00 2958.00 | 6030.00 6036.00 | 13668.00 13674.00 | //: 32-bit, MSC_v9.00 [ C =111]
+ 2_ || 1476.00 1476.00 | 3015.00 3015.00 | 6831.00 6834.00 | //: 32-bit, MSC_v9.00 [ C =111]
+ 4_ || 738.00 739.50 | 1507.50 1507.50 | 3415.50 3415.50 | //: 32-bit, MSC_v9.00 [ C =111]
+ 8_ || 369.00 369.75 | 751.50 751.50 | 1707.00 1707.00 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10_ || 295.80 295.80 | 603.00 603.60 | 1366.80 1366.80 | //: 32-bit, MSC_v9.00 [ C =111]
+ 16_ || 184.88 185.25 | 376.50 376.50 | 855.38 855.38 | //: 32-bit, MSC_v9.00 [ C =111]
+ 32_ || 91.31 91.50 | 188.44 188.63 | 427.50 427.50 | //: 32-bit, MSC_v9.00 [ C =111]
+ 64_ || 66.56 66.66 | 93.84 93.84 | 213.56 213.66 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100_ || 69.96 70.02 | 88.98 89.04 | 136.92 137.52 | //: 32-bit, MSC_v9.00 [ C =111]
+ 128_ || 54.14 54.23 | 69.52 69.75 | 106.69 106.88 | //: 32-bit, MSC_v9.00 [ C =111]
+ 256_ || 47.70 47.77 | 57.12 57.19 | 79.24 79.29 | //: 32-bit, MSC_v9.00 [ C =111]
+ 512_ || 44.46 44.54 | 50.75 50.81 | 65.52 65.55 | //: 32-bit, MSC_v9.00 [ C =111]
+ 1000_ || 43.90 43.96 | 48.78 48.85 | 60.08 60.11 | //: 32-bit, MSC_v9.00 [ C =111]
+ 1024_ || 42.83 42.87 | 47.44 47.65 | 58.49 58.51 | //: 32-bit, MSC_v9.00 [ C =111]
+ 2048_ || 42.17 42.17 | 45.83 45.83 | 55.01 55.16 | //: 32-bit, MSC_v9.00 [ C =111]
+ 4096_ || 41.76 41.76 | 45.02 45.03 | 53.27 53.44 | //: 32-bit, MSC_v9.00 [ C =111]
+ 8192_ || 38.35 41.55 | 41.20 41.26 | 48.54 51.37 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10000_ || 41.53 41.59 | 44.76 44.80 | 53.01 53.01 | //: 32-bit, MSC_v9.00 [ C =111]
+ 16384_ || 41.38 41.69 | 44.43 44.47 | 52.00 52.07 | //: 32-bit, MSC_v9.00 [ C =111]
+ 32768_ || 41.36 41.38 | 44.43 44.44 | 51.86 52.07 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ || 41.32 41.60 | 44.52 44.62 | 51.75 51.92 | //: 32-bit, MSC_v9.00 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+ Block || 1712 bytes | 3664 bytes | 7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:36,Oct 7 2008 by 'MSC_v6.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 9150.00 9156.00 | 16794.00 16836.00 | 36456.00 37386.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 2_ || 4335.00 4335.00 | 8244.00 8352.00 | 18156.00 18246.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 4_ || 2167.50 2167.50 | 4117.50 4201.50 | 9031.50 9060.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 8_ || 1083.00 1083.75 | 2122.50 2125.50 | 4515.00 4611.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 10_ || 874.80 874.80 | 1683.60 1695.60 | 3621.00 3705.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 16_ || 541.50 541.88 | 1041.38 1042.50 | 2268.38 2274.00 | //: 32-bit, MSC_v6.00 [ C =111]
+ 32_ || 271.88 272.25 | 515.63 526.13 | 1133.81 1139.06 | //: 32-bit, MSC_v6.00 [ C =111]
+ 64_ || 201.00 201.09 | 259.59 263.72 | 567.47 569.34 | //: 32-bit, MSC_v6.00 [ C =111]
+ 100_ || 211.92 211.98 | 250.32 251.10 | 363.06 363.60 | //: 32-bit, MSC_v6.00 [ C =111]
+ 128_ || 166.78 167.11 | 196.73 198.28 | 283.45 284.20 | //: 32-bit, MSC_v6.00 [ C =111]
+ 256_ || 147.94 147.94 | 160.57 160.71 | 212.18 212.72 | //: 32-bit, MSC_v6.00 [ C =111]
+ 512_ || 139.32 139.37 | 143.68 143.70 | 175.95 176.36 | //: 32-bit, MSC_v6.00 [ C =111]
+ 1000_ || 138.17 138.18 | 140.23 140.80 | 168.46 168.46 | //: 32-bit, MSC_v6.00 [ C =111]
+ 1024_ || 134.92 134.92 | 135.90 136.72 | 164.48 164.48 | //: 32-bit, MSC_v6.00 [ C =111]
+ 2048_ || 132.76 132.76 | 132.19 132.25 | 154.34 155.67 | //: 32-bit, MSC_v6.00 [ C =111]
+ 4096_ || 131.66 131.74 | 132.76 133.34 | 149.64 150.49 | //: 32-bit, MSC_v6.00 [ C =111]
+ 8192_ || 131.21 135.88 | 120.29 124.46 | 142.16 147.73 | //: 32-bit, MSC_v6.00 [ C =111]
+ 10000_ || 124.18 125.11 | 123.38 125.46 | 139.12 140.88 | //: 32-bit, MSC_v6.00 [ C =111]
+ 16384_ || 124.27 130.94 | 122.04 127.55 | 137.91 146.32 | //: 32-bit, MSC_v6.00 [ C =111]
+ 32768_ || 123.57 128.76 | 120.72 121.97 | 138.10 140.89 | //: 32-bit, MSC_v6.00 [ C =111]
+ 100000_ || 123.30 129.21 | 123.83 125.61 | 145.19 145.40 | //: 32-bit, MSC_v6.00 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1486 bytes | 1348 bytes | 1445 bytes | //: 32-bit, MSC_v6.00 [ C =111]
+ Block || 2435 bytes | 5119 bytes | 8894 bytes | //: 32-bit, MSC_v6.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:48,Oct 7 2008 by 'MSC_v4.20', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5418.00 5430.00 | 9450.00 9462.00 | 20436.00 20472.00 | //: 32-bit, MSC_v4.20 [ C =111]
+ 2_ || 2709.00 2721.00 | 4725.00 4731.00 | 10212.00 10245.00 | //: 32-bit, MSC_v4.20 [ C =111]
+ 4_ || 1351.50 1354.50 | 2359.50 2361.00 | 5097.00 5107.50 | //: 32-bit, MSC_v4.20 [ C =111]
+ 8_ || 675.00 678.75 | 1179.75 1179.75 | 2549.25 2552.25 | //: 32-bit, MSC_v4.20 [ C =111]
+ 10_ || 540.60 546.60 | 943.20 944.40 | 2041.20 2041.80 | //: 32-bit, MSC_v4.20 [ C =111]
+ 16_ || 337.88 338.25 | 589.50 589.50 | 1273.88 1275.38 | //: 32-bit, MSC_v4.20 [ C =111]
+ 32_ || 167.81 167.81 | 294.94 295.13 | 636.75 637.13 | //: 32-bit, MSC_v4.20 [ C =111]
+ 64_ || 124.41 124.41 | 147.19 147.84 | 318.28 318.47 | //: 32-bit, MSC_v4.20 [ C =111]
+ 100_ || 131.46 131.52 | 140.10 140.28 | 203.76 203.94 | //: 32-bit, MSC_v4.20 [ C =111]
+ 128_ || 102.42 102.47 | 109.22 109.41 | 159.05 159.38 | //: 32-bit, MSC_v4.20 [ C =111]
+ 256_ || 91.10 91.27 | 90.59 90.59 | 118.73 118.78 | //: 32-bit, MSC_v4.20 [ C =111]
+ 512_ || 85.43 85.43 | 80.78 80.79 | 98.43 98.48 | //: 32-bit, MSC_v4.20 [ C =111]
+ 1000_ || 84.56 84.56 | 77.74 77.75 | 90.24 90.28 | //: 32-bit, MSC_v4.20 [ C =111]
+ 1024_ || 82.55 82.55 | 75.83 75.83 | 88.15 88.19 | //: 32-bit, MSC_v4.20 [ C =111]
+ 2048_ || 81.07 81.07 | 73.35 73.36 | 83.00 83.02 | //: 32-bit, MSC_v4.20 [ C =111]
+ 4096_ || 80.34 80.36 | 72.12 72.13 | 80.42 80.44 | //: 32-bit, MSC_v4.20 [ C =111]
+ 8192_ || 79.97 80.54 | 71.56 71.64 | 79.11 79.62 | //: 32-bit, MSC_v4.20 [ C =111]
+ 10000_ || 75.11 80.03 | 66.25 69.37 | 73.59 74.99 | //: 32-bit, MSC_v4.20 [ C =111]
+ 16384_ || 75.57 80.04 | 67.66 71.51 | 74.32 74.42 | //: 32-bit, MSC_v4.20 [ C =111]
+ 32768_ || 75.61 80.15 | 67.03 67.84 | 74.04 78.41 | //: 32-bit, MSC_v4.20 [ C =111]
+ 100000_ || 77.96 80.31 | 67.58 67.84 | 74.31 74.73 | //: 32-bit, MSC_v4.20 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1152 bytes | 1024 bytes | 1088 bytes | //: 32-bit, MSC_v4.20 [ C =111]
+ Block || 2064 bytes | 3840 bytes | 7616 bytes | //: 32-bit, MSC_v4.20 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:29:54,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 780.00 786.00 | 1422.00 1434.00 | 3810.00 3816.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 2_ || 384.00 390.00 | 705.00 708.00 | 1902.00 1902.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 4_ || 193.50 193.50 | 355.50 355.50 | 951.00 952.50 | //: 64-bit, MSC_v9.00 [ C =111]
+ 8_ || 93.75 93.75 | 171.00 171.75 | 474.75 475.50 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10_ || 75.60 76.20 | 140.40 140.40 | 380.40 381.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 16_ || 51.38 51.38 | 93.00 93.00 | 257.25 257.63 | //: 64-bit, MSC_v9.00 [ C =111]
+ 32_ || 25.31 25.31 | 46.50 46.50 | 118.69 118.69 | //: 64-bit, MSC_v9.00 [ C =111]
+ 64_ || 16.69 16.69 | 21.38 21.38 | 59.53 59.53 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100_ || 17.16 17.22 | 20.52 21.00 | 38.22 38.28 | //: 64-bit, MSC_v9.00 [ C =111]
+ 128_ || 13.27 13.27 | 15.80 15.80 | 29.63 29.67 | //: 64-bit, MSC_v9.00 [ C =111]
+ 256_ || 11.16 11.18 | 12.61 12.73 | 22.10 22.10 | //: 64-bit, MSC_v9.00 [ C =111]
+ 512_ || 10.05 10.07 | 11.00 11.07 | 18.18 19.68 | //: 64-bit, MSC_v9.00 [ C =111]
+ 1000_ || 9.69 9.69 | 10.42 10.42 | 16.51 16.51 | //: 64-bit, MSC_v9.00 [ C =111]
+ 1024_ || 9.44 9.44 | 10.18 10.18 | 16.11 16.12 | //: 64-bit, MSC_v9.00 [ C =111]
+ 2048_ || 9.21 9.21 | 9.62 9.62 | 15.06 15.06 | //: 64-bit, MSC_v9.00 [ C =111]
+ 4096_ || 9.10 9.10 | 9.36 9.37 | 14.55 14.55 | //: 64-bit, MSC_v9.00 [ C =111]
+ 8192_ || 8.97 8.97 | 9.20 9.21 | 14.48 14.66 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10000_ || 8.97 8.97 | 9.38 9.38 | 14.38 14.40 | //: 64-bit, MSC_v9.00 [ C =111]
+ 16384_ || 8.95 9.01 | 9.26 9.26 | 14.16 14.29 | //: 64-bit, MSC_v9.00 [ C =111]
+ 32768_ || 8.90 9.24 | 9.18 9.18 | 14.46 14.75 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ || 9.18 9.71 | 9.35 9.49 | 14.79 14.99 | //: 64-bit, MSC_v9.00 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+ Block || 704 bytes | 1456 bytes | 2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:29:57,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 6420.00 6420.00 | 11040.00 11040.00 | 23358.00 23364.00 | //: 32-bit, BCC_v5.51 [ C =111]
+ 2_ || 3210.00 3210.00 | 5517.00 5520.00 | 11679.00 11682.00 | //: 32-bit, BCC_v5.51 [ C =111]
+ 4_ || 1605.00 1605.00 | 2758.50 2758.50 | 5832.00 5833.50 | //: 32-bit, BCC_v5.51 [ C =111]
+ 8_ || 802.50 802.50 | 1379.25 1379.25 | 2916.00 2916.75 | //: 32-bit, BCC_v5.51 [ C =111]
+ 10_ || 642.00 642.00 | 1103.40 1103.40 | 2335.80 2335.80 | //: 32-bit, BCC_v5.51 [ C =111]
+ 16_ || 400.88 401.25 | 689.25 689.62 | 1458.00 1458.00 | //: 32-bit, BCC_v5.51 [ C =111]
+ 32_ || 199.50 199.50 | 344.44 344.44 | 729.00 729.00 | //: 32-bit, BCC_v5.51 [ C =111]
+ 64_ || 146.06 146.25 | 171.66 172.50 | 364.41 364.50 | //: 32-bit, BCC_v5.51 [ C =111]
+ 100_ || 152.28 152.28 | 162.78 162.78 | 233.16 233.16 | //: 32-bit, BCC_v5.51 [ C =111]
+ 128_ || 118.69 118.69 | 126.89 126.89 | 181.88 181.88 | //: 32-bit, BCC_v5.51 [ C =111]
+ 256_ || 104.62 104.62 | 104.48 104.48 | 135.30 135.33 | //: 32-bit, BCC_v5.51 [ C =111]
+ 512_ || 97.50 97.50 | 93.13 93.14 | 112.00 112.00 | //: 32-bit, BCC_v5.51 [ C =111]
+ 1000_ || 96.26 96.26 | 89.53 89.54 | 102.70 102.71 | //: 32-bit, BCC_v5.51 [ C =111]
+ 1024_ || 93.91 93.91 | 87.40 87.40 | 100.27 100.27 | //: 32-bit, BCC_v5.51 [ C =111]
+ 2048_ || 92.14 92.14 | 84.56 84.56 | 94.38 94.39 | //: 32-bit, BCC_v5.51 [ C =111]
+ 4096_ || 91.28 91.28 | 76.72 83.12 | 84.42 86.14 | //: 32-bit, BCC_v5.51 [ C =111]
+ 8192_ || 83.85 86.88 | 76.06 80.17 | 83.06 87.27 | //: 32-bit, BCC_v5.51 [ C =111]
+ 10000_ || 83.92 87.25 | 76.30 83.56 | 86.42 87.19 | //: 32-bit, BCC_v5.51 [ C =111]
+ 16384_ || 85.71 87.12 | 77.78 77.82 | 84.43 84.51 | //: 32-bit, BCC_v5.51 [ C =111]
+ 32768_ || 85.60 86.59 | 77.64 78.17 | 84.32 84.94 | //: 32-bit, BCC_v5.51 [ C =111]
+ 100000_ || 86.18 87.75 | 78.03 79.63 | 84.77 88.79 | //: 32-bit, BCC_v5.51 [ C =111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [ C =111]
+ Block || 1888 bytes | 3028 bytes | 5864 bytes | //: 32-bit, BCC_v5.51 [ C =111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:04,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2664.00 2664.00 | 4998.00 4998.00 | 10704.00 10704.00 | //: 32-bit, BCC_v5.51 [asm=111]
+ 2_ || 1338.00 1338.00 | 2505.00 2508.00 | 5352.00 5352.00 | //: 32-bit, BCC_v5.51 [asm=111]
+ 4_ || 669.00 669.00 | 1246.50 1246.50 | 2668.50 2670.00 | //: 32-bit, BCC_v5.51 [asm=111]
+ 8_ || 334.50 334.50 | 623.25 623.25 | 1334.25 1334.25 | //: 32-bit, BCC_v5.51 [asm=111]
+ 10_ || 266.40 266.40 | 501.00 501.00 | 1058.40 1058.40 | //: 32-bit, BCC_v5.51 [asm=111]
+ 16_ || 166.50 166.50 | 312.75 321.00 | 628.50 629.25 | //: 32-bit, BCC_v5.51 [asm=111]
+ 32_ || 79.88 79.88 | 147.75 147.75 | 312.19 312.38 | //: 32-bit, BCC_v5.51 [asm=111]
+ 64_ || 56.53 56.53 | 73.22 73.22 | 156.09 156.09 | //: 32-bit, BCC_v5.51 [asm=111]
+ 100_ || 58.08 58.08 | 68.52 74.10 | 99.36 107.52 | //: 32-bit, BCC_v5.51 [asm=111]
+ 128_ || 45.19 45.23 | 53.20 53.20 | 77.81 77.81 | //: 32-bit, BCC_v5.51 [asm=111]
+ 256_ || 39.26 39.28 | 43.24 43.24 | 57.52 62.32 | //: 32-bit, BCC_v5.51 [asm=111]
+ 512_ || 36.13 36.13 | 37.76 37.77 | 47.17 47.24 | //: 32-bit, BCC_v5.51 [asm=111]
+ 1000_ || 35.51 35.71 | 36.22 36.23 | 42.92 43.04 | //: 32-bit, BCC_v5.51 [asm=111]
+ 1024_ || 34.51 34.51 | 34.78 35.12 | 42.05 42.05 | //: 32-bit, BCC_v5.51 [asm=111]
+ 2048_ || 33.69 33.70 | 33.82 33.83 | 38.84 39.04 | //: 32-bit, BCC_v5.51 [asm=111]
+ 4096_ || 32.01 33.99 | 33.64 33.64 | 37.82 37.97 | //: 32-bit, BCC_v5.51 [asm=111]
+ 8192_ || 31.77 32.58 | 32.80 33.00 | 36.98 37.59 | //: 32-bit, BCC_v5.51 [asm=111]
+ 10000_ || 33.75 33.75 | 33.13 33.25 | 37.32 37.86 | //: 32-bit, BCC_v5.51 [asm=111]
+ 16384_ || 31.90 36.52 | 35.86 35.90 | 37.26 40.33 | //: 32-bit, BCC_v5.51 [asm=111]
+ 32768_ || 34.29 34.47 | 33.87 34.03 | 37.77 38.04 | //: 32-bit, BCC_v5.51 [asm=111]
+ 100000_ || 33.20 34.48 | 33.75 33.91 | 37.98 38.23 | //: 32-bit, BCC_v5.51 [asm=111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [asm=111]
+ Block || 1276 bytes | 2532 bytes | 4983 bytes | //: 32-bit, BCC_v5.51 [asm=111]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:30:08,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2580.00 2598.00 | 4842.00 4848.00 | 10578.00 10602.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 2_ || 1299.00 1302.00 | 2445.00 2445.00 | 5277.00 5283.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 4_ || 648.00 648.00 | 1213.50 1215.00 | 2644.50 2649.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 8_ || 324.00 324.75 | 610.50 610.50 | 1322.25 1323.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10_ || 259.80 259.80 | 484.20 484.20 | 1059.60 1060.20 | //: 32-bit, MSC_v9.00 [asm=111]
+ 16_ || 162.00 162.38 | 302.63 302.63 | 660.38 662.63 | //: 32-bit, MSC_v9.00 [asm=111]
+ 32_ || 80.81 81.00 | 141.56 141.56 | 308.63 308.63 | //: 32-bit, MSC_v9.00 [asm=111]
+ 64_ || 54.38 54.47 | 70.41 70.41 | 154.41 154.59 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100_ || 57.18 57.24 | 66.42 66.48 | 98.40 98.46 | //: 32-bit, MSC_v9.00 [asm=111]
+ 128_ || 48.28 48.28 | 51.75 51.75 | 76.97 77.02 | //: 32-bit, MSC_v9.00 [asm=111]
+ 256_ || 39.05 39.05 | 42.45 42.47 | 56.95 56.95 | //: 32-bit, MSC_v9.00 [asm=111]
+ 512_ || 36.09 36.11 | 37.65 37.66 | 47.05 47.06 | //: 32-bit, MSC_v9.00 [asm=111]
+ 1000_ || 35.56 35.59 | 35.96 35.96 | 42.79 42.80 | //: 32-bit, MSC_v9.00 [asm=111]
+ 1024_ || 34.62 34.63 | 35.28 35.28 | 41.47 41.47 | //: 32-bit, MSC_v9.00 [asm=111]
+ 2048_ || 33.91 33.91 | 34.00 34.08 | 39.33 39.33 | //: 32-bit, MSC_v9.00 [asm=111]
+ 4096_ || 33.38 33.66 | 33.49 33.49 | 38.04 38.23 | //: 32-bit, MSC_v9.00 [asm=111]
+ 8192_ || 33.15 33.23 | 32.76 33.07 | 37.21 37.22 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10000_ || 33.69 36.50 | 33.29 33.42 | 37.98 41.34 | //: 32-bit, MSC_v9.00 [asm=111]
+ 16384_ || 33.07 35.17 | 33.08 34.97 | 37.10 38.12 | //: 32-bit, MSC_v9.00 [asm=111]
+ 32768_ || 34.35 34.53 | 33.80 34.05 | 38.21 40.46 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ || 33.96 34.57 | 33.93 35.69 | 38.04 38.20 | //: 32-bit, MSC_v9.00 [asm=111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+ Block || 1276 bytes | 2532 bytes | 4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:13,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2514.00 2514.00 | 4836.00 4836.00 | 10392.00 10398.00 | //: 32-bit, GCC_v3.42 [asm=111]
+ 2_ || 1254.00 1260.00 | 2409.00 2412.00 | 5181.00 5184.00 | //: 32-bit, GCC_v3.42 [asm=111]
+ 4_ || 628.50 628.50 | 1204.50 1204.50 | 2596.50 2598.00 | //: 32-bit, GCC_v3.42 [asm=111]
+ 8_ || 312.75 312.75 | 602.25 603.00 | 1298.25 1299.00 | //: 32-bit, GCC_v3.42 [asm=111]
+ 10_ || 250.80 251.40 | 482.40 483.00 | 1035.00 1035.60 | //: 32-bit, GCC_v3.42 [asm=111]
+ 16_ || 157.50 157.50 | 302.25 302.63 | 652.50 652.50 | //: 32-bit, GCC_v3.42 [asm=111]
+ 32_ || 78.19 78.38 | 151.88 152.06 | 326.81 326.81 | //: 32-bit, GCC_v3.42 [asm=111]
+ 64_ || 57.09 57.19 | 75.47 75.47 | 163.31 163.41 | //: 32-bit, GCC_v3.42 [asm=111]
+ 100_ || 60.06 60.06 | 71.22 71.28 | 104.58 104.58 | //: 32-bit, GCC_v3.42 [asm=111]
+ 128_ || 46.83 46.88 | 55.45 55.50 | 81.33 81.38 | //: 32-bit, GCC_v3.42 [asm=111]
+ 256_ || 41.32 41.34 | 45.47 45.49 | 59.91 59.91 | //: 32-bit, GCC_v3.42 [asm=111]
+ 512_ || 38.51 38.52 | 40.16 40.16 | 49.49 49.49 | //: 32-bit, GCC_v3.42 [asm=111]
+ 1000_ || 37.92 37.93 | 38.60 38.60 | 45.40 45.41 | //: 32-bit, GCC_v3.42 [asm=111]
+ 1024_ || 37.08 37.08 | 37.93 38.33 | 45.24 45.25 | //: 32-bit, GCC_v3.42 [asm=111]
+ 2048_ || 36.55 36.56 | 36.88 36.88 | 42.42 42.49 | //: 32-bit, GCC_v3.42 [asm=111]
+ 4096_ || 35.77 35.77 | 33.56 37.02 | 37.73 39.65 | //: 32-bit, GCC_v3.42 [asm=111]
+ 8192_ || 32.68 34.17 | 33.13 33.19 | 38.41 40.47 | //: 32-bit, GCC_v3.42 [asm=111]
+ 10000_ || 35.92 36.59 | 35.00 36.14 | 37.65 39.24 | //: 32-bit, GCC_v3.42 [asm=111]
+ 16384_ || 33.37 34.20 | 32.77 33.93 | 36.86 37.94 | //: 32-bit, GCC_v3.42 [asm=111]
+ 32768_ || 34.22 34.41 | 33.82 34.06 | 37.39 37.74 | //: 32-bit, GCC_v3.42 [asm=111]
+ 100000_ || 34.23 34.34 | 33.81 34.20 | 37.34 37.86 | //: 32-bit, GCC_v3.42 [asm=111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [asm=111]
+ Block || 1276 bytes | 2532 bytes | 4983 bytes | //: 32-bit, GCC_v3.42 [asm=111]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:30:17,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 678.00 678.00 | 1098.00 1098.00 | 2034.00 2040.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 2_ || 339.00 339.00 | 546.00 546.00 | 1017.00 1020.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 4_ || 168.00 169.50 | 273.00 273.00 | 510.00 511.50 | //: 64-bit, MSC_v9.00 [asm=111]
+ 8_ || 81.75 82.50 | 134.25 134.25 | 254.25 255.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10_ || 66.60 66.60 | 109.80 109.80 | 204.00 204.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 16_ || 40.88 40.88 | 66.75 67.13 | 127.50 127.50 | //: 64-bit, MSC_v9.00 [asm=111]
+ 32_ || 20.25 20.44 | 33.56 33.56 | 63.56 63.56 | //: 64-bit, MSC_v9.00 [asm=111]
+ 64_ || 14.91 15.00 | 16.50 16.50 | 31.69 31.69 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100_ || 15.48 16.68 | 16.98 16.98 | 22.38 22.38 | //: 64-bit, MSC_v9.00 [asm=111]
+ 128_ || 12.80 12.80 | 12.94 12.94 | 15.84 15.89 | //: 64-bit, MSC_v9.00 [asm=111]
+ 256_ || 9.84 9.84 | 9.33 9.33 | 11.60 11.63 | //: 64-bit, MSC_v9.00 [asm=111]
+ 512_ || 8.75 8.79 | 8.53 8.57 | 9.36 9.38 | //: 64-bit, MSC_v9.00 [asm=111]
+ 1000_ || 8.45 8.45 | 7.93 7.93 | 8.39 8.39 | //: 64-bit, MSC_v9.00 [asm=111]
+ 1024_ || 8.25 8.25 | 7.14 7.14 | 8.19 8.19 | //: 64-bit, MSC_v9.00 [asm=111]
+ 2048_ || 8.00 8.00 | 6.77 7.33 | 7.58 7.58 | //: 64-bit, MSC_v9.00 [asm=111]
+ 4096_ || 7.88 7.88 | 6.58 6.58 | 7.29 7.29 | //: 64-bit, MSC_v9.00 [asm=111]
+ 8192_ || 7.81 7.81 | 6.49 6.49 | 7.13 7.15 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10000_ || 7.81 7.81 | 6.50 6.50 | 7.18 7.18 | //: 64-bit, MSC_v9.00 [asm=111]
+ 16384_ || 7.79 7.79 | 6.42 6.42 | 7.04 7.04 | //: 64-bit, MSC_v9.00 [asm=111]
+ 32768_ || 7.77 7.77 | 6.40 6.40 | 7.03 7.03 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ || 8.08 8.09 | 6.40 6.71 | 6.98 7.21 | //: 64-bit, MSC_v9.00 [asm=111]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+ Block || 664 bytes | 1074 bytes | 2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:19,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 4272.00 4296.00 | 7974.00 7980.00 | 17484.00 17496.00 | //: 32-bit, GCC_v3.42 [ C =332]
+ 2_ || 2139.00 2154.00 | 3981.00 3996.00 | 8736.00 8754.00 | //: 32-bit, GCC_v3.42 [ C =332]
+ 4_ || 1069.50 1071.00 | 1995.00 2002.50 | 4377.00 4378.50 | //: 32-bit, GCC_v3.42 [ C =332]
+ 8_ || 536.25 538.50 | 998.25 1000.50 | 2183.25 2186.25 | //: 32-bit, GCC_v3.42 [ C =332]
+ 10_ || 429.00 430.20 | 798.60 807.60 | 1749.60 1752.00 | //: 32-bit, GCC_v3.42 [ C =332]
+ 16_ || 267.75 270.00 | 498.00 499.88 | 1092.00 1093.13 | //: 32-bit, GCC_v3.42 [ C =332]
+ 32_ || 132.75 133.50 | 249.19 249.75 | 546.38 547.50 | //: 32-bit, GCC_v3.42 [ C =332]
+ 64_ || 98.44 99.00 | 123.94 124.03 | 272.25 272.34 | //: 32-bit, GCC_v3.42 [ C =332]
+ 100_ || 103.08 103.08 | 117.96 117.96 | 174.24 174.42 | //: 32-bit, GCC_v3.42 [ C =332]
+ 128_ || 80.72 121.13 | 92.34 133.22 | 137.06 137.39 | //: 32-bit, GCC_v3.42 [ C =332]
+ 256_ || 71.91 72.21 | 75.84 76.01 | 101.93 102.09 | //: 32-bit, GCC_v3.42 [ C =332]
+ 512_ || 67.50 67.59 | 67.62 67.75 | 83.95 84.47 | //: 32-bit, GCC_v3.42 [ C =332]
+ 1000_ || 66.71 67.00 | 64.95 65.28 | 77.12 77.20 | //: 32-bit, GCC_v3.42 [ C =332]
+ 1024_ || 64.89 64.96 | 63.19 63.23 | 74.67 74.67 | //: 32-bit, GCC_v3.42 [ C =332]
+ 2048_ || 63.35 63.36 | 61.13 61.14 | 70.19 70.19 | //: 32-bit, GCC_v3.42 [ C =332]
+ 4096_ || 62.80 62.80 | 60.11 60.12 | 62.58 62.65 | //: 32-bit, GCC_v3.42 [ C =332]
+ 8192_ || 57.83 59.01 | 55.02 60.12 | 66.75 66.86 | //: 32-bit, GCC_v3.42 [ C =332]
+ 10000_ || 62.69 62.87 | 59.76 59.87 | 67.20 67.63 | //: 32-bit, GCC_v3.42 [ C =332]
+ 16384_ || 62.50 62.75 | 55.96 59.35 | 62.39 63.28 | //: 32-bit, GCC_v3.42 [ C =332]
+ 32768_ || 58.66 59.69 | 56.17 56.62 | 61.97 63.07 | //: 32-bit, GCC_v3.42 [ C =332]
+ 100000_ || 59.31 59.99 | 56.30 57.46 | 62.79 63.27 | //: 32-bit, GCC_v3.42 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [ C =332]
+ Block || 6640 bytes | 13040 bytes | 18448 bytes | //: 32-bit, GCC_v3.42 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:30:25,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2988.00 2994.00 | 6240.00 6246.00 | 13794.00 13800.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 2_ || 1488.00 1503.00 | 3120.00 3126.00 | 6900.00 6903.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 4_ || 744.00 751.50 | 1560.00 1560.00 | 3445.50 3447.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 8_ || 372.00 372.75 | 777.75 779.25 | 1723.50 1723.50 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10_ || 297.60 299.40 | 623.40 624.00 | 1379.40 1380.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 16_ || 186.38 186.38 | 389.25 389.63 | 861.75 861.75 | //: 32-bit, MSC_v9.00 [ C =332]
+ 32_ || 92.44 92.44 | 195.38 195.56 | 431.25 431.44 | //: 32-bit, MSC_v9.00 [ C =332]
+ 64_ || 67.59 67.78 | 97.03 97.13 | 215.53 215.63 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100_ || 70.26 70.32 | 91.92 91.92 | 138.00 138.06 | //: 32-bit, MSC_v9.00 [ C =332]
+ 128_ || 54.98 55.08 | 71.44 71.48 | 107.58 107.58 | //: 32-bit, MSC_v9.00 [ C =332]
+ 256_ || 48.68 48.70 | 58.57 58.57 | 79.83 79.83 | //: 32-bit, MSC_v9.00 [ C =332]
+ 512_ || 45.43 45.46 | 52.22 52.23 | 65.98 66.01 | //: 32-bit, MSC_v9.00 [ C =332]
+ 1000_ || 44.88 44.89 | 50.20 50.20 | 60.44 60.45 | //: 32-bit, MSC_v9.00 [ C =332]
+ 1024_ || 43.81 43.81 | 48.98 48.99 | 59.00 59.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 2048_ || 43.00 43.00 | 47.36 47.37 | 55.50 55.50 | //: 32-bit, MSC_v9.00 [ C =332]
+ 4096_ || 42.59 42.59 | 46.56 46.57 | 53.75 53.75 | //: 32-bit, MSC_v9.00 [ C =332]
+ 8192_ || 42.38 42.39 | 46.16 46.16 | 52.87 52.87 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10000_ || 42.42 42.42 | 46.30 46.31 | 53.29 53.31 | //: 32-bit, MSC_v9.00 [ C =332]
+ 16384_ || 42.28 42.60 | 45.96 46.75 | 52.45 52.52 | //: 32-bit, MSC_v9.00 [ C =332]
+ 32768_ || 42.25 42.36 | 45.84 45.85 | 52.30 52.32 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ || 42.21 42.50 | 43.60 45.77 | 49.55 50.03 | //: 32-bit, MSC_v9.00 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+ Block || 4560 bytes | 9232 bytes | 12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:31,Oct 7 2008 by 'MSC_v6.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 9054.00 9060.00 | 17406.00 17514.00 | 36888.00 37032.00 | //: 32-bit, MSC_v6.00 [ C =332]
+ 2_ || 4341.00 4341.00 | 9129.00 9168.00 | 18273.00 18423.00 | //: 32-bit, MSC_v6.00 [ C =332]
+ 4_ || 2169.00 2170.50 | 4590.00 4636.50 | 9240.00 9334.50 | //: 32-bit, MSC_v6.00 [ C =332]
+ 8_ || 1083.75 1084.50 | 2223.00 2243.25 | 4581.75 4663.50 | //: 32-bit, MSC_v6.00 [ C =332]
+ 10_ || 867.00 867.60 | 1776.60 1790.40 | 3648.00 3672.00 | //: 32-bit, MSC_v6.00 [ C =332]
+ 16_ || 541.13 541.50 | 1044.75 1051.50 | 2274.38 2278.50 | //: 32-bit, MSC_v6.00 [ C =332]
+ 32_ || 271.13 271.88 | 567.56 573.75 | 1139.25 1140.00 | //: 32-bit, MSC_v6.00 [ C =332]
+ 64_ || 201.09 201.09 | 270.84 272.81 | 569.72 571.59 | //: 32-bit, MSC_v6.00 [ C =332]
+ 100_ || 212.70 213.24 | 261.12 262.02 | 365.16 365.28 | //: 32-bit, MSC_v6.00 [ C =332]
+ 128_ || 166.08 166.45 | 204.84 205.41 | 284.48 288.80 | //: 32-bit, MSC_v6.00 [ C =332]
+ 256_ || 148.69 149.34 | 169.59 169.95 | 221.65 221.79 | //: 32-bit, MSC_v6.00 [ C =332]
+ 512_ || 140.47 140.53 | 148.24 148.48 | 179.11 179.11 | //: 32-bit, MSC_v6.00 [ C =332]
+ 1000_ || 139.66 139.66 | 139.37 139.82 | 163.58 165.01 | //: 32-bit, MSC_v6.00 [ C =332]
+ 1024_ || 136.26 136.27 | 141.22 141.49 | 157.43 158.56 | //: 32-bit, MSC_v6.00 [ C =332]
+ 2048_ || 134.25 134.25 | 135.90 137.12 | 151.73 152.42 | //: 32-bit, MSC_v6.00 [ C =332]
+ 4096_ || 133.89 134.06 | 131.19 134.61 | 147.72 150.40 | //: 32-bit, MSC_v6.00 [ C =332]
+ 8192_ || 132.87 134.43 | 134.17 135.04 | 143.82 148.00 | //: 32-bit, MSC_v6.00 [ C =332]
+ 10000_ || 133.42 134.41 | 124.60 130.22 | 137.58 138.21 | //: 32-bit, MSC_v6.00 [ C =332]
+ 16384_ || 131.73 132.54 | 121.35 122.08 | 135.51 139.44 | //: 32-bit, MSC_v6.00 [ C =332]
+ 32768_ || 124.97 134.41 | 128.74 129.78 | 142.57 143.53 | //: 32-bit, MSC_v6.00 [ C =332]
+ 100000_ || 126.77 134.34 | 126.62 129.40 | 135.08 139.54 | //: 32-bit, MSC_v6.00 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1486 bytes | 1348 bytes | 1445 bytes | //: 32-bit, MSC_v6.00 [ C =332]
+ Block || 6038 bytes | 13395 bytes | 15975 bytes | //: 32-bit, MSC_v6.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:42,Oct 7 2008 by 'MSC_v4.20', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5436.00 5436.00 | 9474.00 9474.00 | 20430.00 20442.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 2_ || 2715.00 2718.00 | 4731.00 4734.00 | 10215.00 10218.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 4_ || 1359.00 1360.50 | 2364.00 2364.00 | 5098.50 5103.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 8_ || 680.25 680.25 | 1182.00 1182.75 | 2549.25 2550.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 10_ || 538.80 543.60 | 945.60 946.20 | 2042.40 2043.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 16_ || 339.38 339.75 | 590.63 591.00 | 1274.63 1275.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 32_ || 166.69 166.69 | 295.31 295.50 | 637.13 637.50 | //: 32-bit, MSC_v4.20 [ C =332]
+ 64_ || 123.47 123.66 | 147.28 147.38 | 318.56 318.75 | //: 32-bit, MSC_v4.20 [ C =332]
+ 100_ || 130.62 130.74 | 140.28 140.76 | 203.94 204.00 | //: 32-bit, MSC_v4.20 [ C =332]
+ 128_ || 101.44 101.67 | 109.31 109.45 | 159.14 159.33 | //: 32-bit, MSC_v4.20 [ C =332]
+ 256_ || 90.70 90.75 | 90.52 90.56 | 118.66 118.83 | //: 32-bit, MSC_v4.20 [ C =332]
+ 512_ || 85.00 85.03 | 80.81 80.87 | 98.46 98.46 | //: 32-bit, MSC_v4.20 [ C =332]
+ 1000_ || 84.00 84.01 | 77.78 77.78 | 90.31 90.32 | //: 32-bit, MSC_v4.20 [ C =332]
+ 1024_ || 81.99 82.13 | 75.93 75.93 | 88.18 88.18 | //: 32-bit, MSC_v4.20 [ C =332]
+ 2048_ || 80.66 80.68 | 73.43 73.43 | 83.02 83.02 | //: 32-bit, MSC_v4.20 [ C =332]
+ 4096_ || 79.98 80.38 | 72.23 72.27 | 80.45 80.45 | //: 32-bit, MSC_v4.20 [ C =332]
+ 8192_ || 79.63 80.15 | 71.66 71.73 | 79.15 79.22 | //: 32-bit, MSC_v4.20 [ C =332]
+ 10000_ || 79.65 80.07 | 71.85 72.33 | 79.79 79.82 | //: 32-bit, MSC_v4.20 [ C =332]
+ 16384_ || 79.66 79.71 | 71.40 71.41 | 78.77 78.85 | //: 32-bit, MSC_v4.20 [ C =332]
+ 32768_ || 75.39 79.68 | 67.83 71.25 | 78.23 78.50 | //: 32-bit, MSC_v4.20 [ C =332]
+ 100000_ || 75.49 77.32 | 67.60 67.87 | 74.33 75.55 | //: 32-bit, MSC_v4.20 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1152 bytes | 1024 bytes | 1088 bytes | //: 32-bit, MSC_v4.20 [ C =332]
+ Block || 4736 bytes | 8976 bytes | 12896 bytes | //: 32-bit, MSC_v4.20 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:30:49,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 780.00 798.00 | 1920.00 1920.00 | 3732.00 3732.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 2_ || 387.00 387.00 | 951.00 951.00 | 1866.00 1869.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 4_ || 199.50 199.50 | 477.00 480.00 | 936.00 936.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 8_ || 95.25 96.00 | 231.75 235.50 | 467.25 468.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10_ || 76.80 76.80 | 189.00 191.40 | 402.60 402.60 | //: 64-bit, MSC_v9.00 [ C =332]
+ 16_ || 51.75 51.75 | 127.13 128.63 | 234.75 254.25 | //: 64-bit, MSC_v9.00 [ C =332]
+ 32_ || 23.63 23.63 | 58.13 58.31 | 115.69 115.69 | //: 64-bit, MSC_v9.00 [ C =332]
+ 64_ || 16.69 16.88 | 28.88 28.97 | 58.31 58.31 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100_ || 17.10 17.16 | 27.66 27.90 | 37.62 37.62 | //: 64-bit, MSC_v9.00 [ C =332]
+ 128_ || 12.98 13.13 | 21.14 21.47 | 29.16 29.16 | //: 64-bit, MSC_v9.00 [ C =332]
+ 256_ || 11.27 11.30 | 17.04 17.18 | 21.66 21.73 | //: 64-bit, MSC_v9.00 [ C =332]
+ 512_ || 10.20 10.20 | 16.21 16.21 | 17.79 17.82 | //: 64-bit, MSC_v9.00 [ C =332]
+ 1000_ || 9.98 10.12 | 14.23 14.25 | 16.13 16.13 | //: 64-bit, MSC_v9.00 [ C =332]
+ 1024_ || 9.73 10.54 | 13.88 13.89 | 15.73 15.73 | //: 64-bit, MSC_v9.00 [ C =332]
+ 2048_ || 9.48 9.48 | 13.51 13.51 | 14.70 14.70 | //: 64-bit, MSC_v9.00 [ C =332]
+ 4096_ || 9.35 9.36 | 13.21 13.22 | 14.16 14.16 | //: 64-bit, MSC_v9.00 [ C =332]
+ 8192_ || 9.25 9.25 | 13.08 13.08 | 13.93 13.93 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10000_ || 9.27 9.28 | 12.89 12.99 | 13.98 13.98 | //: 64-bit, MSC_v9.00 [ C =332]
+ 16384_ || 9.26 9.28 | 12.77 12.89 | 13.74 13.74 | //: 64-bit, MSC_v9.00 [ C =332]
+ 32768_ || 9.23 9.25 | 12.83 13.09 | 13.77 14.27 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ || 9.32 9.56 | 13.12 13.19 | 14.15 14.23 | //: 64-bit, MSC_v9.00 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+ Block || 1200 bytes | 2928 bytes | 5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:30:52,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5958.00 5958.00 | 10182.00 10188.00 | 21522.00 21522.00 | //: 32-bit, BCC_v5.51 [ C =332]
+ 2_ || 3228.00 3228.00 | 5091.00 5091.00 | 10761.00 10761.00 | //: 32-bit, BCC_v5.51 [ C =332]
+ 4_ || 1491.00 1491.00 | 2544.00 2544.00 | 5374.50 5821.50 | //: 32-bit, BCC_v5.51 [ C =332]
+ 8_ || 744.75 745.50 | 1272.00 1272.00 | 2686.50 2686.50 | //: 32-bit, BCC_v5.51 [ C =332]
+ 10_ || 595.80 595.80 | 1017.60 1102.80 | 2151.60 2152.20 | //: 32-bit, BCC_v5.51 [ C =332]
+ 16_ || 372.00 372.38 | 636.00 636.00 | 1343.25 1343.62 | //: 32-bit, BCC_v5.51 [ C =332]
+ 32_ || 184.69 184.69 | 317.62 317.81 | 671.62 671.62 | //: 32-bit, BCC_v5.51 [ C =332]
+ 64_ || 135.56 135.56 | 171.56 171.56 | 335.72 335.81 | //: 32-bit, BCC_v5.51 [ C =332]
+ 100_ || 141.90 141.90 | 150.18 150.18 | 232.68 232.68 | //: 32-bit, BCC_v5.51 [ C =332]
+ 128_ || 119.67 119.67 | 126.84 126.84 | 181.50 181.55 | //: 32-bit, BCC_v5.51 [ C =332]
+ 256_ || 106.15 106.15 | 104.25 104.27 | 135.00 135.02 | //: 32-bit, BCC_v5.51 [ C =332]
+ 512_ || 99.42 99.46 | 93.00 93.01 | 111.75 111.76 | //: 32-bit, BCC_v5.51 [ C =332]
+ 1000_ || 98.08 98.08 | 89.47 89.47 | 102.47 102.49 | //: 32-bit, BCC_v5.51 [ C =332]
+ 1024_ || 95.70 95.75 | 87.33 87.33 | 100.03 100.04 | //: 32-bit, BCC_v5.51 [ C =332]
+ 2048_ || 86.60 86.77 | 78.01 78.01 | 86.94 86.94 | //: 32-bit, BCC_v5.51 [ C =332]
+ 4096_ || 85.95 85.96 | 76.71 76.71 | 84.25 84.25 | //: 32-bit, BCC_v5.51 [ C =332]
+ 8192_ || 85.56 92.63 | 76.07 82.40 | 89.80 90.33 | //: 32-bit, BCC_v5.51 [ C =332]
+ 10000_ || 86.68 90.16 | 79.43 81.88 | 89.21 90.98 | //: 32-bit, BCC_v5.51 [ C =332]
+ 16384_ || 87.25 92.78 | 82.30 82.31 | 89.34 89.41 | //: 32-bit, BCC_v5.51 [ C =332]
+ 32768_ || 92.38 92.56 | 81.90 82.46 | 89.01 89.03 | //: 32-bit, BCC_v5.51 [ C =332]
+ 100000_ || 88.00 88.57 | 78.54 81.88 | 84.53 88.34 | //: 32-bit, BCC_v5.51 [ C =332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [ C =332]
+ Block || 4340 bytes | 7660 bytes | 10408 bytes | //: 32-bit, BCC_v5.51 [ C =332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:31:00,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2784.00 2784.00 | 5094.00 5100.00 | 10800.00 10806.00 | //: 32-bit, BCC_v5.51 [asm=332]
+ 2_ || 1374.00 1377.00 | 2538.00 2538.00 | 5370.00 5373.00 | //: 32-bit, BCC_v5.51 [asm=332]
+ 4_ || 687.00 687.00 | 1267.50 1267.50 | 2695.50 2697.00 | //: 32-bit, BCC_v5.51 [asm=332]
+ 8_ || 341.25 341.25 | 633.00 633.75 | 1348.50 1348.50 | //: 32-bit, BCC_v5.51 [asm=332]
+ 10_ || 278.40 282.00 | 512.40 513.00 | 1089.00 1089.00 | //: 32-bit, BCC_v5.51 [asm=332]
+ 16_ || 172.50 172.50 | 320.25 320.25 | 678.00 679.12 | //: 32-bit, BCC_v5.51 [asm=332]
+ 32_ || 85.88 85.88 | 159.38 159.56 | 339.75 339.75 | //: 32-bit, BCC_v5.51 [asm=332]
+ 64_ || 61.03 61.12 | 79.31 79.41 | 169.78 169.78 | //: 32-bit, BCC_v5.51 [asm=332]
+ 100_ || 62.40 62.46 | 73.92 73.98 | 108.48 108.72 | //: 32-bit, BCC_v5.51 [asm=332]
+ 128_ || 48.28 48.28 | 57.47 57.47 | 84.14 84.14 | //: 32-bit, BCC_v5.51 [asm=332]
+ 256_ || 42.00 42.02 | 46.03 46.31 | 61.90 61.90 | //: 32-bit, BCC_v5.51 [asm=332]
+ 512_ || 37.96 39.39 | 41.17 41.18 | 51.16 51.33 | //: 32-bit, BCC_v5.51 [asm=332]
+ 1000_ || 37.45 37.74 | 38.66 62.12 | 45.85 69.53 | //: 32-bit, BCC_v5.51 [asm=332]
+ 1024_ || 36.34 58.89 | 37.42 60.09 | 44.31 44.62 | //: 32-bit, BCC_v5.51 [asm=332]
+ 2048_ || 35.77 36.18 | 36.06 36.34 | 42.06 42.07 | //: 32-bit, BCC_v5.51 [asm=332]
+ 4096_ || 35.04 35.44 | 35.33 35.33 | 40.48 48.07 | //: 32-bit, BCC_v5.51 [asm=332]
+ 8192_ || 34.80 43.64 | 35.91 35.92 | 40.46 40.66 | //: 32-bit, BCC_v5.51 [asm=332]
+ 10000_ || 35.05 36.40 | 35.54 37.36 | 41.01 54.16 | //: 32-bit, BCC_v5.51 [asm=332]
+ 16384_ || 34.92 36.14 | 35.74 40.79 | 40.28 43.83 | //: 32-bit, BCC_v5.51 [asm=332]
+ 32768_ || 35.39 38.29 | 35.19 37.31 | 39.88 40.94 | //: 32-bit, BCC_v5.51 [asm=332]
+ 100000_ || 36.40 38.36 | 35.18 37.16 | 40.05 40.36 | //: 32-bit, BCC_v5.51 [asm=332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [asm=332]
+ Block || 3060 bytes | 6300 bytes | 8835 bytes | //: 32-bit, BCC_v5.51 [asm=332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:31:04,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2598.00 2604.00 | 4866.00 4878.00 | 10614.00 10632.00 | //: 32-bit, MSC_v9.00 [asm=332]
+ 2_ || 1290.00 1299.00 | 2451.00 2460.00 | 5331.00 5334.00 | //: 32-bit, MSC_v9.00 [asm=332]
+ 4_ || 648.00 649.50 | 1222.50 1222.50 | 2647.50 2656.50 | //: 32-bit, MSC_v9.00 [asm=332]
+ 8_ || 326.25 327.75 | 612.00 614.25 | 1330.50 1332.75 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10_ || 260.40 261.00 | 490.20 490.20 | 1067.40 1067.40 | //: 32-bit, MSC_v9.00 [asm=332]
+ 16_ || 162.38 162.38 | 306.00 306.00 | 661.50 661.88 | //: 32-bit, MSC_v9.00 [asm=332]
+ 32_ || 80.81 80.81 | 153.94 153.94 | 333.75 333.94 | //: 32-bit, MSC_v9.00 [asm=332]
+ 64_ || 58.78 58.78 | 76.13 76.41 | 166.88 166.88 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100_ || 60.78 60.78 | 72.00 72.00 | 106.86 106.92 | //: 32-bit, MSC_v9.00 [asm=332]
+ 128_ || 47.58 47.63 | 55.92 55.92 | 83.16 83.20 | //: 32-bit, MSC_v9.00 [asm=332]
+ 256_ || 42.05 42.05 | 45.75 45.75 | 61.59 61.64 | //: 32-bit, MSC_v9.00 [asm=332]
+ 512_ || 39.18 39.19 | 40.82 41.02 | 50.93 51.02 | //: 32-bit, MSC_v9.00 [asm=332]
+ 1000_ || 38.38 38.42 | 39.17 39.19 | 46.49 46.61 | //: 32-bit, MSC_v9.00 [asm=332]
+ 1024_ || 37.38 37.78 | 38.02 60.78 | 67.69 68.44 | //: 32-bit, MSC_v9.00 [asm=332]
+ 2048_ || 36.80 48.25 | 36.66 48.20 | 42.67 42.81 | //: 32-bit, MSC_v9.00 [asm=332]
+ 4096_ || 36.57 36.59 | 36.25 36.26 | 41.31 41.40 | //: 32-bit, MSC_v9.00 [asm=332]
+ 8192_ || 36.21 36.30 | 35.84 38.76 | 40.68 40.71 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10000_ || 40.98 47.69 | 35.81 35.86 | 40.96 43.93 | //: 32-bit, MSC_v9.00 [asm=332]
+ 16384_ || 36.27 38.04 | 35.77 43.58 | 40.33 43.27 | //: 32-bit, MSC_v9.00 [asm=332]
+ 32768_ || 36.04 41.09 | 35.57 35.89 | 40.17 40.36 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ || 34.46 36.34 | 34.07 37.16 | 39.60 43.18 | //: 32-bit, MSC_v9.00 [asm=332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+ Block || 3060 bytes | 6300 bytes | 8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:31:10,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2550.00 2568.00 | 4896.00 4902.00 | 10662.00 10728.00 | //: 32-bit, GCC_v3.42 [asm=332]
+ 2_ || 1275.00 1290.00 | 2445.00 2457.00 | 5355.00 5379.00 | //: 32-bit, GCC_v3.42 [asm=332]
+ 4_ || 640.50 645.00 | 1224.00 1225.50 | 2655.00 2668.50 | //: 32-bit, GCC_v3.42 [asm=332]
+ 8_ || 318.75 320.25 | 609.75 610.50 | 1328.25 1332.75 | //: 32-bit, GCC_v3.42 [asm=332]
+ 10_ || 254.40 257.40 | 488.40 490.20 | 1070.40 1074.60 | //: 32-bit, GCC_v3.42 [asm=332]
+ 16_ || 161.25 162.00 | 306.38 307.13 | 669.00 671.25 | //: 32-bit, GCC_v3.42 [asm=332]
+ 32_ || 79.88 80.06 | 153.56 153.75 | 333.00 335.63 | //: 32-bit, GCC_v3.42 [asm=332]
+ 64_ || 58.50 58.69 | 76.50 76.59 | 166.69 167.34 | //: 32-bit, GCC_v3.42 [asm=332]
+ 100_ || 60.78 61.02 | 72.36 73.08 | 107.04 107.58 | //: 32-bit, GCC_v3.42 [asm=332]
+ 128_ || 47.39 47.44 | 56.06 56.30 | 83.44 83.63 | //: 32-bit, GCC_v3.42 [asm=332]
+ 256_ || 41.79 41.88 | 46.03 46.10 | 61.71 61.92 | //: 32-bit, GCC_v3.42 [asm=332]
+ 512_ || 39.20 39.33 | 40.96 41.10 | 51.23 51.30 | //: 32-bit, GCC_v3.42 [asm=332]
+ 1000_ || 38.40 38.57 | 39.23 39.26 | 46.83 47.06 | //: 32-bit, GCC_v3.42 [asm=332]
+ 1024_ || 37.53 37.72 | 38.27 38.33 | 45.78 46.00 | //: 32-bit, GCC_v3.42 [asm=332]
+ 2048_ || 36.94 37.00 | 37.03 37.15 | 43.10 56.43 | //: 32-bit, GCC_v3.42 [asm=332]
+ 4096_ || 41.78 53.31 | 36.01 40.53 | 40.97 41.13 | //: 32-bit, GCC_v3.42 [asm=332]
+ 8192_ || 35.90 36.04 | 35.84 48.31 | 40.53 40.55 | //: 32-bit, GCC_v3.42 [asm=332]
+ 10000_ || 36.42 36.48 | 35.85 46.01 | 40.60 40.74 | //: 32-bit, GCC_v3.42 [asm=332]
+ 16384_ || 36.20 39.37 | 35.61 38.72 | 40.15 41.33 | //: 32-bit, GCC_v3.42 [asm=332]
+ 32768_ || 36.47 40.53 | 35.81 39.15 | 40.13 41.96 | //: 32-bit, GCC_v3.42 [asm=332]
+ 100000_ || 36.70 43.77 | 35.89 37.72 | 40.20 44.66 | //: 32-bit, GCC_v3.42 [asm=332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [asm=332]
+ Block || 3060 bytes | 6300 bytes | 8835 bytes | //: 32-bit, GCC_v3.42 [asm=332]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:31:14,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2028.00 2034.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 2_ || 339.00 339.00 | 549.00 549.00 | 1014.00 1017.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 4_ || 168.00 169.50 | 276.00 276.00 | 507.00 508.50 | //: 64-bit, MSC_v9.00 [asm=332]
+ 8_ || 81.75 81.75 | 135.00 146.25 | 273.75 273.75 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10_ || 70.80 70.80 | 120.00 120.00 | 219.00 219.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 16_ || 44.25 44.25 | 74.25 74.25 | 126.00 126.38 | //: 64-bit, MSC_v9.00 [asm=332]
+ 32_ || 20.06 20.25 | 33.75 33.75 | 63.00 63.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 64_ || 14.53 14.53 | 16.69 16.97 | 34.13 34.13 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100_ || 15.72 15.72 | 16.74 16.74 | 22.20 22.20 | //: 64-bit, MSC_v9.00 [asm=332]
+ 128_ || 11.06 11.11 | 11.77 11.81 | 15.70 15.70 | //: 64-bit, MSC_v9.00 [asm=332]
+ 256_ || 9.52 9.52 | 9.05 9.07 | 12.38 12.40 | //: 64-bit, MSC_v9.00 [asm=332]
+ 512_ || 9.35 9.35 | 7.72 7.72 | 9.26 9.26 | //: 64-bit, MSC_v9.00 [asm=332]
+ 1000_ || 8.42 8.42 | 7.22 7.22 | 8.30 8.30 | //: 64-bit, MSC_v9.00 [asm=332]
+ 1024_ || 8.19 8.87 | 7.62 7.63 | 8.12 8.12 | //: 64-bit, MSC_v9.00 [asm=332]
+ 2048_ || 7.97 7.97 | 7.25 7.38 | 7.52 8.15 | //: 64-bit, MSC_v9.00 [asm=332]
+ 4096_ || 7.86 7.88 | 6.54 7.09 | 7.84 11.52 | //: 64-bit, MSC_v9.00 [asm=332]
+ 8192_ || 8.49 11.80 | 9.78 10.72 | 7.05 10.38 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10000_ || 7.85 8.51 | 6.58 6.58 | 7.11 7.12 | //: 64-bit, MSC_v9.00 [asm=332]
+ 16384_ || 7.86 7.88 | 6.41 6.41 | 7.00 7.01 | //: 64-bit, MSC_v9.00 [asm=332]
+ 32768_ || 7.89 9.85 | 6.50 7.00 | 6.94 6.97 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ || 7.80 9.43 | 6.90 7.71 | 7.18 8.48 | //: 64-bit, MSC_v9.00 [asm=332]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+ Block || 1288 bytes | 2182 bytes | 3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:31:16,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 3954.00 3966.00 | 7350.00 7350.00 | 40698.00 40704.00 | //: 32-bit, GCC_v3.42 [ C =335]
+ 2_ || 1977.00 1977.00 | 3678.00 3678.00 | 22035.00 24258.00 | //: 32-bit, GCC_v3.42 [ C =335]
+ 4_ || 1072.50 1072.50 | 1837.50 1839.00 | 10161.00 11007.00 | //: 32-bit, GCC_v3.42 [ C =335]
+ 8_ || 495.00 495.75 | 993.75 999.75 | 5456.25 8527.50 | //: 32-bit, GCC_v3.42 [ C =335]
+ 10_ || 429.60 430.80 | 730.80 801.60 | 4380.00 5951.40 | //: 32-bit, GCC_v3.42 [ C =335]
+ 16_ || 249.00 269.25 | 499.13 502.13 | 2741.63 4381.13 | //: 32-bit, GCC_v3.42 [ C =335]
+ 32_ || 133.31 135.19 | 249.19 251.25 | 1369.69 2140.69 | //: 32-bit, GCC_v3.42 [ C =335]
+ 64_ || 90.84 99.00 | 114.47 123.94 | 635.06 635.16 | //: 32-bit, GCC_v3.42 [ C =335]
+ 100_ || 95.28 103.56 | 108.90 109.44 | 406.50 580.20 | //: 32-bit, GCC_v3.42 [ C =335]
+ 128_ || 74.44 80.44 | 85.50 91.88 | 317.25 317.30 | //: 32-bit, GCC_v3.42 [ C =335]
+ 256_ || 66.00 71.48 | 69.70 69.80 | 237.12 237.12 | //: 32-bit, GCC_v3.42 [ C =335]
+ 512_ || 66.96 66.98 | 67.36 67.39 | 224.53 224.58 | //: 32-bit, GCC_v3.42 [ C =335]
+ 1000_ || 66.20 66.21 | 64.66 64.73 | 205.97 206.02 | //: 32-bit, GCC_v3.42 [ C =335]
+ 1024_ || 64.61 87.60 | 63.19 63.20 | 175.51 194.46 | //: 32-bit, GCC_v3.42 [ C =335]
+ 2048_ || 58.87 66.35 | 56.44 60.15 | 165.23 193.47 | //: 32-bit, GCC_v3.42 [ C =335]
+ 4096_ || 58.09 71.92 | 55.50 55.51 | 168.27 197.84 | //: 32-bit, GCC_v3.42 [ C =335]
+ 8192_ || 57.83 61.97 | 55.02 64.00 | 173.49 203.78 | //: 32-bit, GCC_v3.42 [ C =335]
+ 10000_ || 62.69 63.71 | 59.52 62.18 | 176.01 194.46 | //: 32-bit, GCC_v3.42 [ C =335]
+ 16384_ || 62.78 65.42 | 59.37 63.71 | 182.36 201.21 | //: 32-bit, GCC_v3.42 [ C =335]
+ 32768_ || 62.48 70.41 | 59.61 63.00 | 184.14 189.59 | //: 32-bit, GCC_v3.42 [ C =335]
+ 100000_ || 61.14 68.82 | 59.72 62.87 | 190.22 202.45 | //: 32-bit, GCC_v3.42 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [ C =335]
+ Block || 6640 bytes | 13040 bytes | 41968 bytes | //: 32-bit, GCC_v3.42 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:31:27,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2994.00 2994.00 | 6240.00 6240.00 | 14598.00 14604.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 2_ || 1488.00 1494.00 | 3123.00 3126.00 | 7308.00 7311.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 4_ || 744.00 745.50 | 1558.50 1558.50 | 3646.50 3648.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 8_ || 372.00 372.00 | 779.25 780.00 | 1827.75 1827.75 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10_ || 300.60 301.20 | 624.00 624.60 | 1459.20 1461.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 16_ || 187.13 187.88 | 389.63 389.63 | 913.50 913.88 | //: 32-bit, MSC_v9.00 [ C =335]
+ 32_ || 92.63 93.19 | 195.38 195.56 | 456.56 456.56 | //: 32-bit, MSC_v9.00 [ C =335]
+ 64_ || 67.69 67.78 | 97.03 97.13 | 228.66 228.75 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100_ || 70.62 70.68 | 91.86 91.92 | 146.10 146.16 | //: 32-bit, MSC_v9.00 [ C =335]
+ 128_ || 54.84 55.27 | 71.48 71.48 | 112.88 112.92 | //: 32-bit, MSC_v9.00 [ C =335]
+ 256_ || 48.49 48.54 | 58.48 58.50 | 83.37 83.48 | //: 32-bit, MSC_v9.00 [ C =335]
+ 512_ || 45.42 45.42 | 52.07 52.23 | 68.57 68.60 | //: 32-bit, MSC_v9.00 [ C =335]
+ 1000_ || 44.65 44.65 | 50.20 50.20 | 62.74 62.76 | //: 32-bit, MSC_v9.00 [ C =335]
+ 1024_ || 43.80 43.80 | 48.98 48.99 | 61.13 61.14 | //: 32-bit, MSC_v9.00 [ C =335]
+ 2048_ || 43.00 43.00 | 47.36 47.37 | 57.45 57.47 | //: 32-bit, MSC_v9.00 [ C =335]
+ 4096_ || 42.33 42.34 | 46.57 46.57 | 55.59 55.60 | //: 32-bit, MSC_v9.00 [ C =335]
+ 8192_ || 42.21 42.25 | 46.16 46.17 | 54.66 54.67 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10000_ || 42.16 42.42 | 46.31 46.73 | 55.11 55.13 | //: 32-bit, MSC_v9.00 [ C =335]
+ 16384_ || 42.28 42.29 | 46.21 46.24 | 54.20 54.24 | //: 32-bit, MSC_v9.00 [ C =335]
+ 32768_ || 42.35 42.36 | 45.95 46.10 | 50.90 51.80 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ || 40.09 40.55 | 45.76 45.97 | 51.00 53.08 | //: 32-bit, MSC_v9.00 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+ Block || 4560 bytes | 9232 bytes | 29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:31:35,Oct 7 2008 by 'MSC_v6.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 8670.00 8682.00 | 17376.00 17436.00 | 55890.00 56922.00 | //: 32-bit, MSC_v6.00 [ C =335]
+ 2_ || 4344.00 4347.00 | 9315.00 9414.00 | 27747.00 27771.00 | //: 32-bit, MSC_v6.00 [ C =335]
+ 4_ || 2164.50 2164.50 | 4500.00 4522.50 | 13807.50 13896.00 | //: 32-bit, MSC_v6.00 [ C =335]
+ 8_ || 1082.25 1088.25 | 2235.00 2245.50 | 6796.50 6931.50 | //: 32-bit, MSC_v6.00 [ C =335]
+ 10_ || 866.40 871.80 | 1800.60 1811.40 | 5465.40 5560.80 | //: 32-bit, MSC_v6.00 [ C =335]
+ 16_ || 548.25 548.25 | 1124.25 1129.88 | 3447.00 3447.75 | //: 32-bit, MSC_v6.00 [ C =335]
+ 32_ || 272.25 272.63 | 560.44 563.81 | 1723.50 1723.88 | //: 32-bit, MSC_v6.00 [ C =335]
+ 64_ || 202.59 203.44 | 282.47 283.50 | 837.00 840.09 | //: 32-bit, MSC_v6.00 [ C =335]
+ 100_ || 214.38 215.88 | 269.28 269.82 | 532.74 532.86 | //: 32-bit, MSC_v6.00 [ C =335]
+ 128_ || 167.63 167.67 | 208.64 210.70 | 418.03 421.73 | //: 32-bit, MSC_v6.00 [ C =335]
+ 256_ || 149.41 150.16 | 173.23 173.79 | 317.27 320.23 | //: 32-bit, MSC_v6.00 [ C =335]
+ 512_ || 147.41 147.48 | 148.73 148.88 | 265.04 265.10 | //: 32-bit, MSC_v6.00 [ C =335]
+ 1000_ || 140.53 140.53 | 144.00 144.20 | 244.22 250.33 | //: 32-bit, MSC_v6.00 [ C =335]
+ 1024_ || 141.71 141.73 | 142.42 142.59 | 235.18 235.96 | //: 32-bit, MSC_v6.00 [ C =335]
+ 2048_ || 135.36 135.38 | 137.08 137.16 | 220.27 221.11 | //: 32-bit, MSC_v6.00 [ C =335]
+ 4096_ || 124.41 128.93 | 123.98 126.91 | 200.32 204.77 | //: 32-bit, MSC_v6.00 [ C =335]
+ 8192_ || 124.35 126.84 | 124.37 130.27 | 204.17 219.92 | //: 32-bit, MSC_v6.00 [ C =335]
+ 10000_ || 126.45 133.37 | 133.76 134.30 | 217.48 218.27 | //: 32-bit, MSC_v6.00 [ C =335]
+ 16384_ || 133.14 135.72 | 128.15 128.86 | 191.67 194.97 | //: 32-bit, MSC_v6.00 [ C =335]
+ 32768_ || 129.20 133.13 | 127.94 129.85 | 202.98 210.08 | //: 32-bit, MSC_v6.00 [ C =335]
+ 100000_ || 130.83 133.01 | 121.08 129.21 | 192.14 200.80 | //: 32-bit, MSC_v6.00 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1486 bytes | 1348 bytes | 1445 bytes | //: 32-bit, MSC_v6.00 [ C =335]
+ Block || 6038 bytes | 13395 bytes | 37221 bytes | //: 32-bit, MSC_v6.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:31:48,Oct 7 2008 by 'MSC_v4.20', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5430.00 5436.00 | 9474.00 9474.00 | 21330.00 21366.00 | //: 32-bit, MSC_v4.20 [ C =335]
+ 2_ || 2706.00 2724.00 | 4731.00 4734.00 | 10662.00 10683.00 | //: 32-bit, MSC_v4.20 [ C =335]
+ 4_ || 1359.00 1359.00 | 2364.00 2365.50 | 5322.00 5332.50 | //: 32-bit, MSC_v4.20 [ C =335]
+ 8_ || 675.00 679.50 | 1182.00 1182.75 | 2661.75 2666.25 | //: 32-bit, MSC_v4.20 [ C =335]
+ 10_ || 540.00 543.60 | 945.60 945.60 | 2132.40 2136.00 | //: 32-bit, MSC_v4.20 [ C =335]
+ 16_ || 337.50 339.75 | 590.63 591.00 | 1330.50 1332.75 | //: 32-bit, MSC_v4.20 [ C =335]
+ 32_ || 166.69 167.25 | 295.31 295.50 | 665.25 666.38 | //: 32-bit, MSC_v4.20 [ C =335]
+ 64_ || 123.66 123.84 | 147.19 147.28 | 332.63 333.19 | //: 32-bit, MSC_v4.20 [ C =335]
+ 100_ || 130.62 130.98 | 140.28 140.76 | 212.88 213.24 | //: 32-bit, MSC_v4.20 [ C =335]
+ 128_ || 101.44 101.72 | 109.31 109.45 | 166.08 166.41 | //: 32-bit, MSC_v4.20 [ C =335]
+ 256_ || 90.54 90.73 | 90.54 90.56 | 124.99 125.11 | //: 32-bit, MSC_v4.20 [ C =335]
+ 512_ || 84.93 85.02 | 80.95 80.95 | 101.98 101.98 | //: 32-bit, MSC_v4.20 [ C =335]
+ 1000_ || 84.00 84.02 | 77.78 77.79 | 93.61 93.62 | //: 32-bit, MSC_v4.20 [ C =335]
+ 1024_ || 81.96 82.10 | 75.93 75.93 | 91.37 91.39 | //: 32-bit, MSC_v4.20 [ C =335]
+ 2048_ || 80.68 80.69 | 73.49 73.49 | 85.58 85.59 | //: 32-bit, MSC_v4.20 [ C =335]
+ 4096_ || 79.98 80.00 | 72.23 72.24 | 82.21 82.57 | //: 32-bit, MSC_v4.20 [ C =335]
+ 8192_ || 79.62 80.01 | 71.61 72.15 | 80.57 81.37 | //: 32-bit, MSC_v4.20 [ C =335]
+ 10000_ || 79.72 80.04 | 71.86 71.92 | 81.67 81.67 | //: 32-bit, MSC_v4.20 [ C =335]
+ 16384_ || 79.47 79.72 | 67.22 67.80 | 76.81 77.22 | //: 32-bit, MSC_v4.20 [ C =335]
+ 32768_ || 75.20 79.32 | 67.07 68.60 | 75.91 78.02 | //: 32-bit, MSC_v4.20 [ C =335]
+ 100000_ || 75.38 75.82 | 67.48 69.43 | 74.87 77.52 | //: 32-bit, MSC_v4.20 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1152 bytes | 1024 bytes | 1088 bytes | //: 32-bit, MSC_v4.20 [ C =335]
+ Block || 4736 bytes | 8976 bytes | 28880 bytes | //: 32-bit, MSC_v4.20 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:31:56,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 780.00 798.00 | 1890.00 1920.00 | 3498.00 3498.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 2_ || 387.00 387.00 | 951.00 969.00 | 1752.00 1752.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 4_ || 201.00 201.00 | 477.00 478.50 | 877.50 877.50 | //: 64-bit, MSC_v9.00 [ C =335]
+ 8_ || 95.25 96.00 | 234.75 237.00 | 440.25 441.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10_ || 77.40 78.00 | 190.80 195.00 | 350.40 379.20 | //: 64-bit, MSC_v9.00 [ C =335]
+ 16_ || 52.13 52.13 | 126.75 127.13 | 237.00 237.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 32_ || 23.81 25.69 | 58.69 59.25 | 109.88 109.88 | //: 64-bit, MSC_v9.00 [ C =335]
+ 64_ || 16.97 16.97 | 29.34 29.72 | 54.75 54.75 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100_ || 17.10 17.10 | 27.72 28.08 | 35.28 35.28 | //: 64-bit, MSC_v9.00 [ C =335]
+ 128_ || 13.03 13.03 | 21.19 21.52 | 27.33 27.33 | //: 64-bit, MSC_v9.00 [ C =335]
+ 256_ || 11.20 11.23 | 17.02 17.16 | 20.23 20.25 | //: 64-bit, MSC_v9.00 [ C =335]
+ 512_ || 10.21 10.22 | 14.95 16.15 | 16.56 17.94 | //: 64-bit, MSC_v9.00 [ C =335]
+ 1000_ || 9.95 10.00 | 14.23 14.24 | 15.09 15.10 | //: 64-bit, MSC_v9.00 [ C =335]
+ 1024_ || 9.71 10.50 | 13.91 13.91 | 14.68 14.68 | //: 64-bit, MSC_v9.00 [ C =335]
+ 2048_ || 9.48 9.73 | 13.43 13.51 | 13.73 13.75 | //: 64-bit, MSC_v9.00 [ C =335]
+ 4096_ || 9.36 9.36 | 13.21 13.22 | 13.28 13.28 | //: 64-bit, MSC_v9.00 [ C =335]
+ 8192_ || 9.28 9.31 | 12.83 12.94 | 12.97 14.48 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10000_ || 9.30 10.06 | 12.94 14.10 | 13.07 14.36 | //: 64-bit, MSC_v9.00 [ C =335]
+ 16384_ || 9.25 9.27 | 12.98 13.01 | 12.83 12.83 | //: 64-bit, MSC_v9.00 [ C =335]
+ 32768_ || 9.22 9.24 | 12.81 12.91 | 12.90 12.90 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ || 9.33 9.58 | 13.94 13.95 | 13.24 13.92 | //: 64-bit, MSC_v9.00 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+ Block || 1200 bytes | 2928 bytes | 10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:32:00,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 5964.00 6462.00 | 10182.00 10188.00 | 21516.00 21516.00 | //: 32-bit, BCC_v5.51 [ C =335]
+ 2_ || 2979.00 2982.00 | 5091.00 5091.00 | 10758.00 10761.00 | //: 32-bit, BCC_v5.51 [ C =335]
+ 4_ || 1489.50 1489.50 | 2545.50 2757.00 | 5374.50 5374.50 | //: 32-bit, BCC_v5.51 [ C =335]
+ 8_ || 744.75 745.50 | 1272.00 1272.00 | 2687.25 2687.25 | //: 32-bit, BCC_v5.51 [ C =335]
+ 10_ || 595.20 595.20 | 1017.60 1017.60 | 2151.60 2152.20 | //: 32-bit, BCC_v5.51 [ C =335]
+ 16_ || 372.38 372.38 | 636.00 636.00 | 1343.25 1343.62 | //: 32-bit, BCC_v5.51 [ C =335]
+ 32_ || 200.25 200.25 | 317.62 317.81 | 671.44 672.00 | //: 32-bit, BCC_v5.51 [ C =335]
+ 64_ || 135.56 135.56 | 158.34 158.34 | 335.72 335.72 | //: 32-bit, BCC_v5.51 [ C =335]
+ 100_ || 141.78 141.84 | 150.18 150.18 | 214.74 214.74 | //: 32-bit, BCC_v5.51 [ C =335]
+ 128_ || 110.44 110.44 | 117.05 117.09 | 167.53 167.53 | //: 32-bit, BCC_v5.51 [ C =335]
+ 256_ || 106.15 106.15 | 104.25 104.27 | 135.00 135.02 | //: 32-bit, BCC_v5.51 [ C =335]
+ 512_ || 99.46 99.50 | 93.00 93.01 | 111.62 111.63 | //: 32-bit, BCC_v5.51 [ C =335]
+ 1000_ || 98.07 98.17 | 89.48 89.48 | 102.39 102.39 | //: 32-bit, BCC_v5.51 [ C =335]
+ 1024_ || 95.84 95.84 | 87.35 87.35 | 99.96 99.96 | //: 32-bit, BCC_v5.51 [ C =335]
+ 2048_ || 93.84 94.01 | 84.51 84.52 | 94.10 94.10 | //: 32-bit, BCC_v5.51 [ C =335]
+ 4096_ || 93.12 93.13 | 83.10 83.10 | 84.16 91.18 | //: 32-bit, BCC_v5.51 [ C =335]
+ 8192_ || 92.65 93.16 | 82.39 82.91 | 89.71 89.79 | //: 32-bit, BCC_v5.51 [ C =335]
+ 10000_ || 92.75 93.17 | 77.57 78.30 | 83.49 85.13 | //: 32-bit, BCC_v5.51 [ C =335]
+ 16384_ || 87.43 88.16 | 77.83 79.58 | 84.22 84.89 | //: 32-bit, BCC_v5.51 [ C =335]
+ 32768_ || 87.33 88.36 | 77.63 78.37 | 84.49 86.56 | //: 32-bit, BCC_v5.51 [ C =335]
+ 100000_ || 87.96 89.42 | 77.90 78.17 | 84.30 85.04 | //: 32-bit, BCC_v5.51 [ C =335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [ C =335]
+ Block || 4340 bytes | 7660 bytes | 24192 bytes | //: 32-bit, BCC_v5.51 [ C =335]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:32:08,Oct 7 2008 by 'BCC_v5.51', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2718.00 2718.00 | 5076.00 5082.00 | 10746.00 10752.00 | //: 32-bit, BCC_v5.51 [asm=335]
+ 2_ || 1359.00 1362.00 | 2499.00 2499.00 | 5373.00 5376.00 | //: 32-bit, BCC_v5.51 [asm=335]
+ 4_ || 679.50 681.00 | 1245.00 1251.00 | 2673.00 2674.50 | //: 32-bit, BCC_v5.51 [asm=335]
+ 8_ || 340.50 340.50 | 622.50 625.50 | 1335.75 1336.50 | //: 32-bit, BCC_v5.51 [asm=335]
+ 10_ || 269.40 269.40 | 499.20 499.80 | 1075.80 1075.80 | //: 32-bit, BCC_v5.51 [asm=335]
+ 16_ || 170.62 170.62 | 310.88 311.25 | 667.50 667.88 | //: 32-bit, BCC_v5.51 [asm=335]
+ 32_ || 82.88 83.06 | 156.00 156.00 | 334.31 334.31 | //: 32-bit, BCC_v5.51 [asm=335]
+ 64_ || 59.25 59.34 | 77.25 77.34 | 166.50 167.53 | //: 32-bit, BCC_v5.51 [asm=335]
+ 100_ || 60.48 60.54 | 72.30 72.36 | 107.16 107.22 | //: 32-bit, BCC_v5.51 [asm=335]
+ 128_ || 47.02 47.02 | 56.30 56.30 | 83.53 84.66 | //: 32-bit, BCC_v5.51 [asm=335]
+ 256_ || 42.21 42.23 | 46.59 46.62 | 62.53 62.53 | //: 32-bit, BCC_v5.51 [asm=335]
+ 512_ || 39.22 39.23 | 41.00 41.00 | 51.18 51.18 | //: 32-bit, BCC_v5.51 [asm=335]
+ 1000_ || 38.02 38.03 | 39.37 39.37 | 46.66 46.67 | //: 32-bit, BCC_v5.51 [asm=335]
+ 1024_ || 34.83 34.83 | 35.55 35.56 | 42.06 42.12 | //: 32-bit, BCC_v5.51 [asm=335]
+ 2048_ || 33.65 33.65 | 34.07 34.07 | 39.49 39.49 | //: 32-bit, BCC_v5.51 [asm=335]
+ 4096_ || 33.34 33.40 | 33.47 33.68 | 38.18 38.18 | //: 32-bit, BCC_v5.51 [asm=335]
+ 8192_ || 32.90 33.36 | 32.87 33.00 | 37.36 38.25 | //: 32-bit, BCC_v5.51 [asm=335]
+ 10000_ || 32.68 33.54 | 33.04 33.37 | 37.54 39.47 | //: 32-bit, BCC_v5.51 [asm=335]
+ 16384_ || 31.89 36.04 | 35.74 35.85 | 40.30 40.32 | //: 32-bit, BCC_v5.51 [asm=335]
+ 32768_ || 35.05 36.16 | 35.79 35.94 | 40.28 40.31 | //: 32-bit, BCC_v5.51 [asm=335]
+ 100000_ || 34.30 35.27 | 33.66 34.18 | 38.25 39.54 | //: 32-bit, BCC_v5.51 [asm=335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 996 bytes | 1000 bytes | 1068 bytes | //: 32-bit, BCC_v5.51 [asm=335]
+ Block || 3060 bytes | 6300 bytes | 20391 bytes | //: 32-bit, BCC_v5.51 [asm=335]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:32:11,Oct 7 2008 by 'MSC_v9.00', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2586.00 2592.00 | 4896.00 4902.00 | 10668.00 10668.00 | //: 32-bit, MSC_v9.00 [asm=335]
+ 2_ || 1311.00 1317.00 | 2448.00 2451.00 | 5340.00 5343.00 | //: 32-bit, MSC_v9.00 [asm=335]
+ 4_ || 652.50 654.00 | 1224.00 1227.00 | 2665.50 2665.50 | //: 32-bit, MSC_v9.00 [asm=335]
+ 8_ || 327.00 328.50 | 613.50 614.25 | 1332.75 1332.75 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10_ || 263.40 263.40 | 489.60 489.60 | 1069.20 1069.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 16_ || 163.88 163.88 | 306.00 306.38 | 666.38 666.75 | //: 32-bit, MSC_v9.00 [asm=335]
+ 32_ || 81.00 81.00 | 154.13 154.31 | 334.31 334.50 | //: 32-bit, MSC_v9.00 [asm=335]
+ 64_ || 58.88 58.97 | 76.41 76.59 | 167.16 167.25 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100_ || 61.08 61.14 | 72.30 72.36 | 107.04 107.10 | //: 32-bit, MSC_v9.00 [asm=335]
+ 128_ || 47.81 47.86 | 56.16 56.20 | 83.34 83.34 | //: 32-bit, MSC_v9.00 [asm=335]
+ 256_ || 42.14 42.16 | 45.89 45.89 | 61.64 61.76 | //: 32-bit, MSC_v9.00 [asm=335]
+ 512_ || 36.23 36.23 | 37.66 37.66 | 46.98 46.99 | //: 32-bit, MSC_v9.00 [asm=335]
+ 1000_ || 35.57 35.57 | 36.11 36.12 | 43.07 43.12 | //: 32-bit, MSC_v9.00 [asm=335]
+ 1024_ || 34.85 34.85 | 35.24 35.24 | 42.05 42.06 | //: 32-bit, MSC_v9.00 [asm=335]
+ 2048_ || 34.17 34.17 | 33.88 34.06 | 39.46 39.53 | //: 32-bit, MSC_v9.00 [asm=335]
+ 4096_ || 33.74 33.85 | 33.38 33.46 | 38.32 38.32 | //: 32-bit, MSC_v9.00 [asm=335]
+ 8192_ || 33.65 33.67 | 33.17 34.20 | 37.70 37.71 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10000_ || 33.68 34.51 | 33.29 36.32 | 37.91 39.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 16384_ || 33.26 35.05 | 32.98 35.06 | 37.34 39.35 | //: 32-bit, MSC_v9.00 [asm=335]
+ 32768_ || 36.34 36.38 | 35.62 35.76 | 40.21 41.08 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ || 36.32 36.43 | 35.91 35.98 | 38.02 38.19 | //: 32-bit, MSC_v9.00 [asm=335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+ Block || 3060 bytes | 6300 bytes | 20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+
+Skein performance, in clks per byte, dtMin = 36 clks.
+ [compiled 14:32:16,Oct 7 2008 by 'GCC_v3.42', 32-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 2562.00 2562.00 | 4866.00 4866.00 | 10698.00 10722.00 | //: 32-bit, GCC_v3.42 [asm=335]
+ 2_ || 1269.00 1275.00 | 2436.00 2439.00 | 5343.00 5355.00 | //: 32-bit, GCC_v3.42 [asm=335]
+ 4_ || 645.00 645.00 | 1222.50 1224.00 | 2667.00 2676.00 | //: 32-bit, GCC_v3.42 [asm=335]
+ 8_ || 322.50 323.25 | 610.50 610.50 | 1332.75 1338.00 | //: 32-bit, GCC_v3.42 [asm=335]
+ 10_ || 254.40 255.60 | 486.60 487.20 | 1070.40 1072.20 | //: 32-bit, GCC_v3.42 [asm=335]
+ 16_ || 161.63 162.00 | 306.00 306.00 | 668.25 669.38 | //: 32-bit, GCC_v3.42 [asm=335]
+ 32_ || 73.69 73.69 | 141.56 141.94 | 307.31 307.50 | //: 32-bit, GCC_v3.42 [asm=335]
+ 64_ || 54.28 54.38 | 70.59 70.59 | 153.66 153.75 | //: 32-bit, GCC_v3.42 [asm=335]
+ 100_ || 56.16 56.40 | 66.66 66.66 | 98.40 107.22 | //: 32-bit, GCC_v3.42 [asm=335]
+ 128_ || 47.67 47.81 | 56.16 56.20 | 83.25 83.25 | //: 32-bit, GCC_v3.42 [asm=335]
+ 256_ || 41.72 41.86 | 45.84 45.87 | 61.48 61.52 | //: 32-bit, GCC_v3.42 [asm=335]
+ 512_ || 38.66 38.68 | 40.70 40.70 | 50.68 50.71 | //: 32-bit, GCC_v3.42 [asm=335]
+ 1000_ || 38.09 38.57 | 38.98 38.99 | 46.42 46.43 | //: 32-bit, GCC_v3.42 [asm=335]
+ 1024_ || 37.16 37.17 | 38.10 38.10 | 45.29 45.30 | //: 32-bit, GCC_v3.42 [asm=335]
+ 2048_ || 36.50 36.78 | 36.76 36.76 | 42.45 42.54 | //: 32-bit, GCC_v3.42 [asm=335]
+ 4096_ || 36.23 36.52 | 36.06 36.06 | 41.19 41.21 | //: 32-bit, GCC_v3.42 [asm=335]
+ 8192_ || 33.71 35.10 | 33.02 33.08 | 37.40 37.47 | //: 32-bit, GCC_v3.42 [asm=335]
+ 10000_ || 33.41 33.76 | 33.24 33.24 | 37.68 38.45 | //: 32-bit, GCC_v3.42 [asm=335]
+ 16384_ || 33.63 34.72 | 33.12 35.81 | 40.09 40.41 | //: 32-bit, GCC_v3.42 [asm=335]
+ 32768_ || 33.96 34.18 | 33.53 33.86 | 37.88 38.29 | //: 32-bit, GCC_v3.42 [asm=335]
+ 100000_ || 35.61 36.38 | 34.24 35.18 | 37.98 38.64 | //: 32-bit, GCC_v3.42 [asm=335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 1568 bytes | 1264 bytes | 1472 bytes | //: 32-bit, GCC_v3.42 [asm=335]
+ Block || 3060 bytes | 6300 bytes | 20391 bytes | //: 32-bit, GCC_v3.42 [asm=335]
+
+Skein performance, in clks per byte, dtMin = 24 clks.
+ [compiled 14:32:20,Oct 7 2008 by 'MSC_v9.00', 64-bit]
+ =================================================================
+ || Skein block size |
+ ||--------------------------------------------------------------|
+ Message || 256 bits | 512 bits | 1024 bits |
+ Length ||====================|====================|====================|
+ (bytes) || min median | min median | min median |
+=========||====================|====================|====================|
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2022.00 2022.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 2_ || 339.00 342.00 | 549.00 549.00 | 1011.00 1014.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 4_ || 168.00 169.50 | 277.50 277.50 | 505.50 505.50 | //: 64-bit, MSC_v9.00 [asm=335]
+ 8_ || 81.00 81.75 | 135.00 135.00 | 252.00 252.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10_ || 65.40 65.40 | 109.80 109.80 | 201.60 202.20 | //: 64-bit, MSC_v9.00 [asm=335]
+ 16_ || 40.88 40.88 | 67.13 67.50 | 126.00 126.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 32_ || 20.06 20.25 | 33.56 33.75 | 62.81 63.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 64_ || 14.53 14.63 | 18.19 18.28 | 33.84 33.94 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100_ || 15.78 15.78 | 16.80 16.80 | 22.02 22.08 | //: 64-bit, MSC_v9.00 [asm=335]
+ 128_ || 11.11 11.11 | 11.77 11.77 | 15.61 15.66 | //: 64-bit, MSC_v9.00 [asm=335]
+ 256_ || 9.52 9.52 | 9.07 9.09 | 11.41 11.41 | //: 64-bit, MSC_v9.00 [asm=335]
+ 512_ || 8.63 8.64 | 7.72 7.72 | 9.15 9.16 | //: 64-bit, MSC_v9.00 [asm=335]
+ 1000_ || 8.41 8.42 | 7.21 7.22 | 8.24 8.26 | //: 64-bit, MSC_v9.00 [asm=335]
+ 1024_ || 8.89 8.90 | 7.62 7.63 | 8.08 8.08 | //: 64-bit, MSC_v9.00 [asm=335]
+ 2048_ || 8.00 8.00 | 6.69 7.25 | 7.50 7.50 | //: 64-bit, MSC_v9.00 [asm=335]
+ 4096_ || 7.89 7.89 | 6.52 6.52 | 7.22 7.22 | //: 64-bit, MSC_v9.00 [asm=335]
+ 8192_ || 7.84 7.84 | 6.44 6.44 | 7.07 7.07 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10000_ || 7.84 7.84 | 6.45 6.50 | 7.12 7.12 | //: 64-bit, MSC_v9.00 [asm=335]
+ 16384_ || 7.82 7.82 | 6.40 6.40 | 6.99 7.01 | //: 64-bit, MSC_v9.00 [asm=335]
+ 32768_ || 7.79 7.80 | 6.37 6.37 | 6.96 6.96 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ || 8.11 8.11 | 6.49 6.74 | 6.95 7.26 | //: 64-bit, MSC_v9.00 [asm=335]
+=========||====================|====================|====================|
+Code Size|| | | |
+=========||====================|====================|====================|
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+ Block || 1288 bytes | 2182 bytes | 7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]
diff --git a/Additional_Implementations/skein_rot_search2.c b/Additional_Implementations/skein_rot_search2.c
new file mode 100644
index 000000000000..a47f5c81d3e3
--- /dev/null
+++ b/Additional_Implementations/skein_rot_search2.c
@@ -0,0 +1,2538 @@
+/***********************************************************************
+**
+** Generate Skein rotation constant candidate sets and test them.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <ctype.h>
+#include <math.h>
+#include <assert.h>
+#include "brg_types.h" /* get Brian Gladman's platform-specific definitions */
+
+#define uint unsigned int /* short type aliases used throughout this file */
+#define u08b uint_8t /* uint_8t/uint_32t/uint_64t: presumably declared in brg_types.h -- confirm */
+#define u32b uint_32t
+#define u64b uint_64t
+
+/* Threefish algorithm parameters */
+#ifndef BITS_PER_WORD /* a definition supplied before this point takes precedence */
+#define BITS_PER_WORD (64) /* number of bits in each word of a Threefish block */
+#endif
+
+#define ROUNDS_PER_CYCLE (8) /* when do we inject keys and start reusing rotation constants? */
+#define MAX_BITS_PER_BLK (1024) /* largest block size handled, in bits */
+
+#define MAX_WORDS_PER_BLK (MAX_BITS_PER_BLK/BITS_PER_WORD) /* words in the largest block */
+#define MAX_ROTS_PER_CYCLE (MAX_WORDS_PER_BLK*(ROUNDS_PER_CYCLE/2)) /* rotation constants per cycle: one per word pair per round */
+
+/* default search parameters for different block sizes (suffix _4/_8/_16 = words per block) */
+#define DEFAULT_GEN_CNT_4 (5500) /* presumably the default for testParms.genCntMax -- confirm */
+#define DEFAULT_ROUND_CNT_4 ( 8) /* presumably the default for testParms.rounds -- confirm */
+#define MIN_HW_OR_4 (50) /* presumably the default for testParms.minHW_or -- confirm */
+#define MAX_SAT_ROUNDS_4 ( 9) /* presumably the default for testParms.maxSatRnds -- confirm */
+
+#define DEFAULT_GEN_CNT_8 (1600)
+#define DEFAULT_ROUND_CNT_8 ( 8)
+#define MIN_HW_OR_8 (36)
+#define MAX_SAT_ROUNDS_8 (10)
+
+#define DEFAULT_GEN_CNT_16 (400) /* the 1024-bit search is slower, so search for fewer iterations :-( */
+#define DEFAULT_ROUND_CNT_16 ( 9)
+#define MIN_HW_OR_16 (40)
+#define MAX_SAT_ROUNDS_16 (11)
+
+#define MAX_ROT_VER_CNT ( 4) /* number of entries in rSearchRec.hw_OR[] */
+#define MAX_ROT_VER_MASK ((1 << MAX_ROT_VER_CNT ) - 1) /* low MAX_ROT_VER_CNT bits set */
+
+#define MAX_POP_CNT (1024) /* size of population */
+#define MIN_POP_CNT ( 32)
+#define DEFAULT_POP_CNT (MAX_POP_CNT)
+
+#define ID_RECALC_BIT_NUM (16) /* bit position of ID_RECALC_BIT */
+#define TWIDDLE_CNT_BIT0 (17) /* lowest bit of the twiddle count packed into rSearchRec.ID */
+#define TWIDDLE_CNT_MASK ((1 << TWIDDLE_CNT_BIT0 ) - 1) /* every bit below TWIDDLE_CNT_BIT0 (bits 0..16) */
+#define ID_RECALC_BIT ( 1 << ID_RECALC_BIT_NUM ) /* single flag bit, just above ID_NUM_MASK */
+#define ID_NUM_MASK ((1 << ID_RECALC_BIT_NUM ) - 1) /* bits 0..15 */
+
+#if BITS_PER_WORD == 64 /* Word = the integer type that holds one Threefish word */
+typedef u64b Word;
+#elif BITS_PER_WORD == 32
+typedef u32b Word;
+#else /* any other width is rejected at compile time */
+#error "Invalid BITS_PER_WORD"
+#endif
+
+/* tstFlag bits: distinct single-bit flags that can be OR-ed together (see testParms.tstFlags) */
+#define TST_FLG_SHOW (1u << 0)
+#define TST_FLG_SHOW_HIST (1u << 1)
+#define TST_FLG_VERBOSE (1u << 2)
+#define TST_FLG_STDERR (1u << 3)
+#define TST_FLG_QUICK_EXIT (1u << 4)
+#define TST_FLG_USE_ABS (1u << 5)
+#define TST_FLG_KEEP_MIN_HW (1u << 6)
+#define TST_FLG_WEIGHT_REP (1u << 7)
+#define TST_FLG_CHECK_ONE (1u << 8)
+#define TST_FLG_DO_RAND (1u << 9)
+
+/* parameters for ShowSearchRec */
+#define SHOW_ROTS_FINAL (4)
+#define SHOW_ROTS_H (3)
+#define SHOW_ROTS_PRELIM (2)
+#define SHOW_ROTS (1)
+#define SHOW_NONE (0)
+
+typedef struct { Word x[MAX_WORDS_PER_BLK]; } Block; /* one block of words, sized for the largest block */
+
+typedef void cycle_func(Word *b, const u08b *rotates, int rounds); /* signature of the fwd_cycle_N / rev_cycle_N round functions */
+
+typedef struct /* record for dealing with rotation searches */
+ {
+ u08b rotList[MAX_ROTS_PER_CYCLE]; /* rotation constants */
+ uint CRC; /* CRC of rotates[] -- use as a quick "ID" */
+ uint ID; /* (get_rotation index) + (TwiddleCnt << TWIDDLE_CNT_BIT0) */
+ uint parentCRC; /* CRC of the parent (allows us to track genealogy a bit) */
+ uint rWorst; /* "worst" min bit-to-bit differential */
+ u08b hw_OR[MAX_ROT_VER_CNT]; /* min hamming weights (over all words), using OR */
+ } rSearchRec;
+
+typedef struct /* pass a bunch of parameters to RunSearch */
+ {
+ uint tstFlags; /* TST_FLG_* bits */
+ uint rounds;
+ uint minHW_or;
+ uint minOffs;
+ uint diffBits;
+ uint genCntMax;
+ uint sampleCnt;
+ uint maxSatRnds;
+ uint seed0;
+ uint rotVerMask;
+ uint popCnt;
+ uint runHours; /* 0 ==> never */
+ uint dupRotMask; /* zero --> allow dup rots within the same round */
+ uint regradeCnt; /* default = 3 */
+ u64b goodRotCntMask; /* which rotation values are ok? bit n set ==> count n allowed (tested by RotCnt_Bad) */
+ } testParms;
+
+/* globals -- the cycle_func pointers are NULL until something assigns them */
+cycle_func *fwd_cycle = NULL;
+cycle_func *rev_cycle = NULL;
+cycle_func *fwd_cycle_or = NULL; /* slow but steady */
+cycle_func *rev_cycle_or = NULL;
+cycle_func *fwd_cycle_or_rN = NULL; /* optimized for the current # rounds (for speed) */
+cycle_func *rev_cycle_or_rN = NULL;
+const char *rotFileName = NULL; /* read from file instead of generate random? */
+uint bitsPerBlock = 0; /* default is to process all block sizes */
+uint rotsPerCycle; /* no initializer: starts at zero */
+uint wordsPerBlock; /* no initializer: starts at zero */
+
+/* macro "functions" */
+#define RotCnt_Bad(rotCnt) (((t.goodRotCntMask >> (rotCnt)) & 1) == 0)
+#define left_rot(a,N) (((a) << (N)) | ((a) >> (BITS_PER_WORD - (N))))
+#define right_rot(a,N) (((a) >> (N)) | ((a) << (BITS_PER_WORD - (N))))
+#define DUP_64(w32) ((w32) | (((u64b) (w32)) << 32))
+
+/********************** use RC4 to generate test data ******************/
+/* Note: this works identically on all platforms (big/little-endian) */
+static struct
+ {
+ uint I,J; /* RC4 vars: the two indices into state[], kept in 0..255 by RandBytes */
+ u08b state[256]; /* byte table that Rand_Init fills and RandBytes keeps swapping */
+ } prng;
+
+void RandBytes(void *dst,uint byteCnt)   /* write byteCnt RC4 keystream bytes to dst */
+  {
+  u08b *out = (u08b *) dst;
+  uint  k;
+  u08b  si,sj;                             /* the two state bytes being swapped */
+  for (k=0;k<byteCnt;k++)
+    {
+    prng.I = (prng.I+1) & 0xFF;            /* step I around the 256-entry table */
+    si     = prng.state[prng.I];
+    prng.J = (prng.J+si) & 0xFF;           /* J moves by the byte found at I    */
+    sj     = prng.state[prng.J];
+    prng.state[prng.J] = si;               /* swap state[I] <-> state[J]        */
+    prng.state[prng.I] = sj;
+    out[k] = prng.state[(si+sj) & 0xFF];   /* output byte picked by their sum   */
+    }
+  }
+
+/* return one pseudo-random keystream byte, widened to uint */
+uint Rand08(void)
+  {
+  u08b oneByte[1];
+  RandBytes(oneByte,sizeof(oneByte));
+  return (uint) oneByte[0];
+  }
+
+/* build a uint from sizeof(uint) keystream bytes, first byte most significant (independent of host endianness) */
+uint Rand32(void)
+  {
+  u08b  raw[sizeof(uint)];
+  u08b *p;
+  uint  value = 0;
+
+  RandBytes(raw,sizeof(raw));
+  for (p=raw;p<raw+sizeof(raw);p++)
+    value = (value << 8) | *p;
+
+  return value;
+  }
+
+/* build a u64b from sizeof(u64b) keystream bytes, first byte most significant (independent of host endianness) */
+u64b Rand64(void)
+  {
+  u08b raw[sizeof(u64b)];
+  u64b value = 0;
+  uint left  = sizeof(raw);                /* bytes not yet consumed */
+
+  RandBytes(raw,sizeof(raw));
+  while (left)
+    {
+    value = (value << 8) | raw[sizeof(raw) - left];
+    left--;
+    }
+  return value;
+  }
+
+/* seed the RC4-based prng: key = 8 bytes of seed (low byte first), then drop the first 4*256 keystream bytes */
+void Rand_Init(u64b seed)
+  {
+  u08b key[8];                             /* seed bytes, least significant first */
+  u08b discard[4*256];                     /* sink for the skipped keystream      */
+  u08b hold;
+  uint n,mix;
+
+  /* identity table, and the key laid out independent of host endianness */
+  for (n=0;n<256;n++)
+    prng.state[n] = (u08b) n;
+  for (n=0;n<sizeof(key);n++)
+    key[n] = (u08b) (seed >> (8*n));
+
+  /* RC4 key schedule: the 8-byte key repeats across the 256 swaps */
+  for (n=0,mix=0;n<256;n++)
+    {
+    mix  = (mix + prng.state[n] + key[n & 7]) & 0xFF;
+    hold = prng.state[n];
+    prng.state[n]   = prng.state[mix];
+    prng.state[mix] = hold;
+    }
+  prng.I = 0;
+  prng.J = 0;
+
+  RandBytes(discard,sizeof(discard));      /* warm-up: throw away early keystream */
+  }
+
+/* implementations of Skein round functions for various block sizes: every statement triple is add, rotate-left, xor on one word pair */
+void fwd_cycle_16(Word *b, const u08b *rotates, int rounds) /* 16-word block: run 'rounds' forward rounds on b[] in place */
+ {
+ for (;rounds > 0;rounds -=8) /* up to 8 rounds per pass; rotates[0..63] are reused on every pass */
+ {
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+ b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 2]); b[ 5] ^= b[ 4];
+ b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 3]); b[ 7] ^= b[ 6];
+ b[ 8] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[ 4]); b[ 9] ^= b[ 8];
+ b[10] += b[11]; b[11] = left_rot(b[11], rotates[ 5]); b[11] ^= b[10];
+ b[12] += b[13]; b[13] = left_rot(b[13], rotates[ 6]); b[13] ^= b[12];
+ b[14] += b[15]; b[15] = left_rot(b[15], rotates[ 7]); b[15] ^= b[14];
+ if (rounds == 1) break; /* only 1 round was left in this pass */
+
+ b[ 0] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[ 8]); b[ 9] ^= b[ 0];
+ b[ 2] += b[13]; b[13] = left_rot(b[13], rotates[ 9]); b[13] ^= b[ 2];
+ b[ 6] += b[11]; b[11] = left_rot(b[11], rotates[10]); b[11] ^= b[ 6];
+ b[ 4] += b[15]; b[15] = left_rot(b[15], rotates[11]); b[15] ^= b[ 4];
+ b[10] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[12]); b[ 7] ^= b[10];
+ b[12] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[13]); b[ 3] ^= b[12];
+ b[14] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[14]); b[ 5] ^= b[14];
+ b[ 8] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[15]); b[ 1] ^= b[ 8];
+ if (rounds == 2) break; /* only 2 rounds were left in this pass */
+
+ b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[16]); b[ 7] ^= b[ 0];
+ b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[17]); b[ 5] ^= b[ 2];
+ b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[18]); b[ 3] ^= b[ 4];
+ b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[19]); b[ 1] ^= b[ 6];
+ b[12] += b[15]; b[15] = left_rot(b[15], rotates[20]); b[15] ^= b[12];
+ b[14] += b[13]; b[13] = left_rot(b[13], rotates[21]); b[13] ^= b[14];
+ b[ 8] += b[11]; b[11] = left_rot(b[11], rotates[22]); b[11] ^= b[ 8];
+ b[10] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[23]); b[ 9] ^= b[10];
+ if (rounds == 3) break; /* only 3 rounds were left in this pass */
+
+ b[ 0] += b[15]; b[15] = left_rot(b[15], rotates[24]); b[15] ^= b[ 0];
+ b[ 2] += b[11]; b[11] = left_rot(b[11], rotates[25]); b[11] ^= b[ 2];
+ b[ 6] += b[13]; b[13] = left_rot(b[13], rotates[26]); b[13] ^= b[ 6];
+ b[ 4] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[27]); b[ 9] ^= b[ 4];
+ b[14] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[28]); b[ 1] ^= b[14];
+ b[ 8] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[29]); b[ 5] ^= b[ 8];
+ b[10] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[30]); b[ 3] ^= b[10];
+ b[12] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[31]); b[ 7] ^= b[12];
+ if (rounds == 4) break; /* only 4 rounds were left in this pass */
+
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[32]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[33]); b[ 3] ^= b[ 2];
+ b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[34]); b[ 5] ^= b[ 4];
+ b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[35]); b[ 7] ^= b[ 6];
+ b[ 8] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[36]); b[ 9] ^= b[ 8];
+ b[10] += b[11]; b[11] = left_rot(b[11], rotates[37]); b[11] ^= b[10];
+ b[12] += b[13]; b[13] = left_rot(b[13], rotates[38]); b[13] ^= b[12];
+ b[14] += b[15]; b[15] = left_rot(b[15], rotates[39]); b[15] ^= b[14];
+ if (rounds == 5) break; /* only 5 rounds were left in this pass */
+
+ b[ 0] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[40]); b[ 9] ^= b[ 0];
+ b[ 2] += b[13]; b[13] = left_rot(b[13], rotates[41]); b[13] ^= b[ 2];
+ b[ 6] += b[11]; b[11] = left_rot(b[11], rotates[42]); b[11] ^= b[ 6];
+ b[ 4] += b[15]; b[15] = left_rot(b[15], rotates[43]); b[15] ^= b[ 4];
+ b[10] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[44]); b[ 7] ^= b[10];
+ b[12] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[45]); b[ 3] ^= b[12];
+ b[14] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[46]); b[ 5] ^= b[14];
+ b[ 8] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[47]); b[ 1] ^= b[ 8];
+ if (rounds == 6) break; /* only 6 rounds were left in this pass */
+
+ b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[48]); b[ 7] ^= b[ 0];
+ b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[49]); b[ 5] ^= b[ 2];
+ b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[50]); b[ 3] ^= b[ 4];
+ b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[51]); b[ 1] ^= b[ 6];
+ b[12] += b[15]; b[15] = left_rot(b[15], rotates[52]); b[15] ^= b[12];
+ b[14] += b[13]; b[13] = left_rot(b[13], rotates[53]); b[13] ^= b[14];
+ b[ 8] += b[11]; b[11] = left_rot(b[11], rotates[54]); b[11] ^= b[ 8];
+ b[10] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[55]); b[ 9] ^= b[10];
+ if (rounds == 7) break; /* only 7 rounds were left in this pass */
+
+ b[ 0] += b[15]; b[15] = left_rot(b[15], rotates[56]); b[15] ^= b[ 0];
+ b[ 2] += b[11]; b[11] = left_rot(b[11], rotates[57]); b[11] ^= b[ 2];
+ b[ 6] += b[13]; b[13] = left_rot(b[13], rotates[58]); b[13] ^= b[ 6];
+ b[ 4] += b[ 9]; b[ 9] = left_rot(b[ 9], rotates[59]); b[ 9] ^= b[ 4];
+ b[14] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[60]); b[ 1] ^= b[14];
+ b[ 8] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[61]); b[ 5] ^= b[ 8];
+ b[10] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[62]); b[ 3] ^= b[10];
+ b[12] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[63]); b[ 7] ^= b[12];
+ }
+ }
+
+void fwd_cycle_8(Word *b, const u08b *rotates, int rounds) /* 8-word block: run 'rounds' forward rounds on b[] in place */
+ {
+ for (;rounds > 0;rounds -=8) /* up to 8 rounds per pass; rotates[0..31] are reused on every pass */
+ {
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+ b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 2]); b[ 5] ^= b[ 4];
+ b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 3]); b[ 7] ^= b[ 6];
+ if (rounds == 1) break; /* only 1 round was left in this pass */
+
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 4]); b[ 1] ^= b[ 2];
+ b[ 4] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[ 5]); b[ 7] ^= b[ 4];
+ b[ 6] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[ 6]); b[ 5] ^= b[ 6];
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 7]); b[ 3] ^= b[ 0];
+ if (rounds == 2) break; /* only 2 rounds were left in this pass */
+
+ b[ 4] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 8]); b[ 1] ^= b[ 4];
+ b[ 6] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 9]); b[ 3] ^= b[ 6];
+ b[ 0] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[10]); b[ 5] ^= b[ 0];
+ b[ 2] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[11]); b[ 7] ^= b[ 2];
+ if (rounds == 3) break; /* only 3 rounds were left in this pass */
+
+ b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[12]); b[ 1] ^= b[ 6];
+ b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[13]); b[ 7] ^= b[ 0];
+ b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[14]); b[ 5] ^= b[ 2];
+ b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[15]); b[ 3] ^= b[ 4];
+ if (rounds == 4) break; /* only 4 rounds were left in this pass */
+
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[16]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[17]); b[ 3] ^= b[ 2];
+ b[ 4] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[18]); b[ 5] ^= b[ 4];
+ b[ 6] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[19]); b[ 7] ^= b[ 6];
+ if (rounds == 5) break; /* only 5 rounds were left in this pass */
+
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[20]); b[ 1] ^= b[ 2];
+ b[ 4] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[21]); b[ 7] ^= b[ 4];
+ b[ 6] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[22]); b[ 5] ^= b[ 6];
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[23]); b[ 3] ^= b[ 0];
+ if (rounds == 6) break; /* only 6 rounds were left in this pass */
+
+ b[ 4] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[24]); b[ 1] ^= b[ 4];
+ b[ 6] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[25]); b[ 3] ^= b[ 6];
+ b[ 0] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[26]); b[ 5] ^= b[ 0];
+ b[ 2] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[27]); b[ 7] ^= b[ 2];
+ if (rounds == 7) break; /* only 7 rounds were left in this pass */
+
+ b[ 6] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[28]); b[ 1] ^= b[ 6];
+ b[ 0] += b[ 7]; b[ 7] = left_rot(b[ 7], rotates[29]); b[ 7] ^= b[ 0];
+ b[ 2] += b[ 5]; b[ 5] = left_rot(b[ 5], rotates[30]); b[ 5] ^= b[ 2];
+ b[ 4] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[31]); b[ 3] ^= b[ 4];
+ }
+ }
+
+void fwd_cycle_4(Word *b, const u08b *rotates, int rounds) /* 4-word block: run 'rounds' forward rounds on b[] in place */
+ {
+ for (;rounds > 0;rounds -=8) /* up to 8 rounds per pass; rotates[0..15] are reused on every pass */
+ {
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 0]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 1]); b[ 3] ^= b[ 2];
+ if (rounds == 1) break; /* only 1 round was left in this pass */
+
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 2]); b[ 3] ^= b[ 0];
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 3]); b[ 1] ^= b[ 2];
+ if (rounds == 2) break; /* only 2 rounds were left in this pass */
+
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 4]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 5]); b[ 3] ^= b[ 2];
+ if (rounds == 3) break; /* only 3 rounds were left in this pass */
+
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 6]); b[ 3] ^= b[ 0];
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 7]); b[ 1] ^= b[ 2];
+ if (rounds == 4) break; /* only 4 rounds were left in this pass */
+
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[ 8]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[ 9]); b[ 3] ^= b[ 2];
+ if (rounds == 5) break; /* only 5 rounds were left in this pass */
+
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[10]); b[ 3] ^= b[ 0];
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[11]); b[ 1] ^= b[ 2];
+ if (rounds == 6) break; /* only 6 rounds were left in this pass */
+
+ b[ 0] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[12]); b[ 1] ^= b[ 0];
+ b[ 2] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[13]); b[ 3] ^= b[ 2];
+ if (rounds == 7) break; /* only 7 rounds were left in this pass */
+
+ b[ 0] += b[ 3]; b[ 3] = left_rot(b[ 3], rotates[14]); b[ 3] ^= b[ 0];
+ b[ 2] += b[ 1]; b[ 1] = left_rot(b[ 1], rotates[15]); b[ 1] ^= b[ 2];
+ }
+ }
+
+/* reverse versions of the cipher: every statement triple is xor, rotate-right, subtract, in the opposite order to the forward rounds */
+void rev_cycle_16(Word *b, const u08b *rotates, int rounds) /* 16-word block: undo 'rounds' rounds of fwd_cycle_16 on b[] in place */
+ {
+ for (;rounds > 0;rounds = (rounds-1) & ~7) /* next pass starts at the multiple of 8 below the current count */
+ {
+ switch (rounds & 7) /* enter at the last round that was run; every case falls through to the earlier rounds */
+ {
+ case 0: /* a full group of 8: start by undoing round 8 (rotates[56..63]) */
+ b[ 7] ^= b[12]; b[ 7] = right_rot(b[ 7], rotates[63]); b[12] -= b[ 7];
+ b[ 3] ^= b[10]; b[ 3] = right_rot(b[ 3], rotates[62]); b[10] -= b[ 3];
+ b[ 5] ^= b[ 8]; b[ 5] = right_rot(b[ 5], rotates[61]); b[ 8] -= b[ 5];
+ b[ 1] ^= b[14]; b[ 1] = right_rot(b[ 1], rotates[60]); b[14] -= b[ 1];
+ b[ 9] ^= b[ 4]; b[ 9] = right_rot(b[ 9], rotates[59]); b[ 4] -= b[ 9];
+ b[13] ^= b[ 6]; b[13] = right_rot(b[13], rotates[58]); b[ 6] -= b[13];
+ b[11] ^= b[ 2]; b[11] = right_rot(b[11], rotates[57]); b[ 2] -= b[11];
+ b[15] ^= b[ 0]; b[15] = right_rot(b[15], rotates[56]); b[ 0] -= b[15];
+ case 7: /* undo round 7 (rotates[48..55]) */
+ b[ 9] ^= b[10]; b[ 9] = right_rot(b[ 9], rotates[55]); b[10] -= b[ 9];
+ b[11] ^= b[ 8]; b[11] = right_rot(b[11], rotates[54]); b[ 8] -= b[11];
+ b[13] ^= b[14]; b[13] = right_rot(b[13], rotates[53]); b[14] -= b[13];
+ b[15] ^= b[12]; b[15] = right_rot(b[15], rotates[52]); b[12] -= b[15];
+ b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[51]); b[ 6] -= b[ 1];
+ b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[50]); b[ 4] -= b[ 3];
+ b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[49]); b[ 2] -= b[ 5];
+ b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[48]); b[ 0] -= b[ 7];
+ case 6: /* undo round 6 (rotates[40..47]) */
+ b[ 1] ^= b[ 8]; b[ 1] = right_rot(b[ 1], rotates[47]); b[ 8] -= b[ 1];
+ b[ 5] ^= b[14]; b[ 5] = right_rot(b[ 5], rotates[46]); b[14] -= b[ 5];
+ b[ 3] ^= b[12]; b[ 3] = right_rot(b[ 3], rotates[45]); b[12] -= b[ 3];
+ b[ 7] ^= b[10]; b[ 7] = right_rot(b[ 7], rotates[44]); b[10] -= b[ 7];
+ b[15] ^= b[ 4]; b[15] = right_rot(b[15], rotates[43]); b[ 4] -= b[15];
+ b[11] ^= b[ 6]; b[11] = right_rot(b[11], rotates[42]); b[ 6] -= b[11];
+ b[13] ^= b[ 2]; b[13] = right_rot(b[13], rotates[41]); b[ 2] -= b[13];
+ b[ 9] ^= b[ 0]; b[ 9] = right_rot(b[ 9], rotates[40]); b[ 0] -= b[ 9];
+ case 5: /* undo round 5 (rotates[32..39]) */
+ b[15] ^= b[14]; b[15] = right_rot(b[15], rotates[39]); b[14] -= b[15];
+ b[13] ^= b[12]; b[13] = right_rot(b[13], rotates[38]); b[12] -= b[13];
+ b[11] ^= b[10]; b[11] = right_rot(b[11], rotates[37]); b[10] -= b[11];
+ b[ 9] ^= b[ 8]; b[ 9] = right_rot(b[ 9], rotates[36]); b[ 8] -= b[ 9];
+ b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[35]); b[ 6] -= b[ 7];
+ b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[34]); b[ 4] -= b[ 5];
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[33]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[32]); b[ 0] -= b[ 1];
+ case 4: /* undo round 4 (rotates[24..31]) */
+ b[ 7] ^= b[12]; b[ 7] = right_rot(b[ 7], rotates[31]); b[12] -= b[ 7];
+ b[ 3] ^= b[10]; b[ 3] = right_rot(b[ 3], rotates[30]); b[10] -= b[ 3];
+ b[ 5] ^= b[ 8]; b[ 5] = right_rot(b[ 5], rotates[29]); b[ 8] -= b[ 5];
+ b[ 1] ^= b[14]; b[ 1] = right_rot(b[ 1], rotates[28]); b[14] -= b[ 1];
+ b[ 9] ^= b[ 4]; b[ 9] = right_rot(b[ 9], rotates[27]); b[ 4] -= b[ 9];
+ b[13] ^= b[ 6]; b[13] = right_rot(b[13], rotates[26]); b[ 6] -= b[13];
+ b[11] ^= b[ 2]; b[11] = right_rot(b[11], rotates[25]); b[ 2] -= b[11];
+ b[15] ^= b[ 0]; b[15] = right_rot(b[15], rotates[24]); b[ 0] -= b[15];
+ case 3: /* undo round 3 (rotates[16..23]) */
+ b[ 9] ^= b[10]; b[ 9] = right_rot(b[ 9], rotates[23]); b[10] -= b[ 9];
+ b[11] ^= b[ 8]; b[11] = right_rot(b[11], rotates[22]); b[ 8] -= b[11];
+ b[13] ^= b[14]; b[13] = right_rot(b[13], rotates[21]); b[14] -= b[13];
+ b[15] ^= b[12]; b[15] = right_rot(b[15], rotates[20]); b[12] -= b[15];
+ b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[19]); b[ 6] -= b[ 1];
+ b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[18]); b[ 4] -= b[ 3];
+ b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[17]); b[ 2] -= b[ 5];
+ b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[16]); b[ 0] -= b[ 7];
+ case 2: /* undo round 2 (rotates[8..15]) */
+ b[ 1] ^= b[ 8]; b[ 1] = right_rot(b[ 1], rotates[15]); b[ 8] -= b[ 1];
+ b[ 5] ^= b[14]; b[ 5] = right_rot(b[ 5], rotates[14]); b[14] -= b[ 5];
+ b[ 3] ^= b[12]; b[ 3] = right_rot(b[ 3], rotates[13]); b[12] -= b[ 3];
+ b[ 7] ^= b[10]; b[ 7] = right_rot(b[ 7], rotates[12]); b[10] -= b[ 7];
+ b[15] ^= b[ 4]; b[15] = right_rot(b[15], rotates[11]); b[ 4] -= b[15];
+ b[11] ^= b[ 6]; b[11] = right_rot(b[11], rotates[10]); b[ 6] -= b[11];
+ b[13] ^= b[ 2]; b[13] = right_rot(b[13], rotates[ 9]); b[ 2] -= b[13];
+ b[ 9] ^= b[ 0]; b[ 9] = right_rot(b[ 9], rotates[ 8]); b[ 0] -= b[ 9];
+ case 1: /* undo round 1 (rotates[0..7]) */
+ b[15] ^= b[14]; b[15] = right_rot(b[15], rotates[ 7]); b[14] -= b[15];
+ b[13] ^= b[12]; b[13] = right_rot(b[13], rotates[ 6]); b[12] -= b[13];
+ b[11] ^= b[10]; b[11] = right_rot(b[11], rotates[ 5]); b[10] -= b[11];
+ b[ 9] ^= b[ 8]; b[ 9] = right_rot(b[ 9], rotates[ 4]); b[ 8] -= b[ 9];
+ b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[ 3]); b[ 6] -= b[ 7];
+ b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[ 2]); b[ 4] -= b[ 5];
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 1]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 0]); b[ 0] -= b[ 1];
+ }
+
+ }
+ }
+
+void rev_cycle_8(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` inverse rounds in place on the 8-word block b[0..7].
+ * Each step is: odd ^= even; odd = right_rot(odd, rotates[k]); even -= odd.
+ * The switch on (rounds & 7) enters the unrolled sequence at the highest
+ * pending round and falls through down to round 1 (case 0 is a full batch
+ * of 8), so the missing `break`s are intentional. The loop step
+ * (rounds-1) & ~7 then drops to the remaining multiple of 8.
+ * rotates[] is read at indices 0..31: 4 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds = (rounds-1) & ~7)
+ {
+ switch (rounds & 7)
+ {
+ case 0: /* undo round 8, then fall through */
+ b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[31]); b[ 4] -= b[ 3];
+ b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[30]); b[ 2] -= b[ 5];
+ b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[29]); b[ 0] -= b[ 7];
+ b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[28]); b[ 6] -= b[ 1];
+ case 7: /* undo round 7 */
+ b[ 7] ^= b[ 2]; b[ 7] = right_rot(b[ 7], rotates[27]); b[ 2] -= b[ 7];
+ b[ 5] ^= b[ 0]; b[ 5] = right_rot(b[ 5], rotates[26]); b[ 0] -= b[ 5];
+ b[ 3] ^= b[ 6]; b[ 3] = right_rot(b[ 3], rotates[25]); b[ 6] -= b[ 3];
+ b[ 1] ^= b[ 4]; b[ 1] = right_rot(b[ 1], rotates[24]); b[ 4] -= b[ 1];
+ case 6: /* undo round 6 */
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[23]); b[ 0] -= b[ 3];
+ b[ 5] ^= b[ 6]; b[ 5] = right_rot(b[ 5], rotates[22]); b[ 6] -= b[ 5];
+ b[ 7] ^= b[ 4]; b[ 7] = right_rot(b[ 7], rotates[21]); b[ 4] -= b[ 7];
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[20]); b[ 2] -= b[ 1];
+ case 5: /* undo round 5 */
+ b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[19]); b[ 6] -= b[ 7];
+ b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[18]); b[ 4] -= b[ 5];
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[17]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[16]); b[ 0] -= b[ 1];
+ case 4: /* undo round 4 */
+ b[ 3] ^= b[ 4]; b[ 3] = right_rot(b[ 3], rotates[15]); b[ 4] -= b[ 3];
+ b[ 5] ^= b[ 2]; b[ 5] = right_rot(b[ 5], rotates[14]); b[ 2] -= b[ 5];
+ b[ 7] ^= b[ 0]; b[ 7] = right_rot(b[ 7], rotates[13]); b[ 0] -= b[ 7];
+ b[ 1] ^= b[ 6]; b[ 1] = right_rot(b[ 1], rotates[12]); b[ 6] -= b[ 1];
+ case 3: /* undo round 3 */
+ b[ 7] ^= b[ 2]; b[ 7] = right_rot(b[ 7], rotates[11]); b[ 2] -= b[ 7];
+ b[ 5] ^= b[ 0]; b[ 5] = right_rot(b[ 5], rotates[10]); b[ 0] -= b[ 5];
+ b[ 3] ^= b[ 6]; b[ 3] = right_rot(b[ 3], rotates[ 9]); b[ 6] -= b[ 3];
+ b[ 1] ^= b[ 4]; b[ 1] = right_rot(b[ 1], rotates[ 8]); b[ 4] -= b[ 1];
+ case 2: /* undo round 2 */
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[ 7]); b[ 0] -= b[ 3];
+ b[ 5] ^= b[ 6]; b[ 5] = right_rot(b[ 5], rotates[ 6]); b[ 6] -= b[ 5];
+ b[ 7] ^= b[ 4]; b[ 7] = right_rot(b[ 7], rotates[ 5]); b[ 4] -= b[ 7];
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[ 4]); b[ 2] -= b[ 1];
+ case 1: /* undo round 1 */
+ b[ 7] ^= b[ 6]; b[ 7] = right_rot(b[ 7], rotates[ 3]); b[ 6] -= b[ 7];
+ b[ 5] ^= b[ 4]; b[ 5] = right_rot(b[ 5], rotates[ 2]); b[ 4] -= b[ 5];
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 1]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 0]); b[ 0] -= b[ 1];
+ }
+ }
+ }
+
+void rev_cycle_4(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` inverse rounds in place on the 4-word block b[0..3].
+ * Each step is: odd ^= even; odd = right_rot(odd, rotates[k]); even -= odd.
+ * The switch on (rounds & 7) enters the unrolled sequence at the highest
+ * pending round and falls through down to round 1 (case 0 is a full batch
+ * of 8), so the missing `break`s are intentional. The loop step
+ * (rounds-1) & ~7 then drops to the remaining multiple of 8.
+ * rotates[] is read at indices 0..15: 2 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds = (rounds-1) & ~7)
+ {
+ switch (rounds & 7)
+ {
+ case 0: /* undo round 8, then fall through */
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[15]); b[ 2] -= b[ 1];
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[14]); b[ 0] -= b[ 3];
+ case 7: /* undo round 7 */
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[13]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[12]); b[ 0] -= b[ 1];
+ case 6: /* undo round 6 */
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[11]); b[ 2] -= b[ 1];
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[10]); b[ 0] -= b[ 3];
+ case 5: /* undo round 5 */
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 9]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 8]); b[ 0] -= b[ 1];
+ case 4: /* undo round 4 */
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[ 7]); b[ 2] -= b[ 1];
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[ 6]); b[ 0] -= b[ 3];
+ case 3: /* undo round 3 */
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 5]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 4]); b[ 0] -= b[ 1];
+ case 2: /* undo round 2 */
+ b[ 1] ^= b[ 2]; b[ 1] = right_rot(b[ 1], rotates[ 3]); b[ 2] -= b[ 1];
+ b[ 3] ^= b[ 0]; b[ 3] = right_rot(b[ 3], rotates[ 2]); b[ 0] -= b[ 3];
+ case 1: /* undo round 1 */
+ b[ 3] ^= b[ 2]; b[ 3] = right_rot(b[ 3], rotates[ 1]); b[ 2] -= b[ 3];
+ b[ 1] ^= b[ 0]; b[ 1] = right_rot(b[ 1], rotates[ 0]); b[ 0] -= b[ 1];
+ }
+ }
+ }
+
+#ifdef TEST_OR /* enable this to simplify testing, since OR is not invertible */
+#define AddOp(I,J) b[I] += b[J] /* test build: the OR routines use the real add/sub/xor ops */
+#define SubOp(I,J) b[I] -= b[J]
+#define XorOp(I,J) b[I] ^= b[J]
+#else /* this is the "real" OR version */
+#define AddOp(I,J) b[I] |= b[J] /* all three ops expand to the same b[I] |= b[J] */
+#define SubOp(I,J) b[I] |= b[J]
+#define XorOp(I,J) b[I] |= b[J]
+#endif /* note: these macros expand against an array named b in the using scope */
+
+/* "OR" versions of the cipher: replace ADD, XOR with OR */
+void fwd_cycle_16_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` forward rounds in place on the 16-word block b[0..15],
+ * using the AddOp/XorOp macros (plain OR unless TEST_OR is defined).
+ * Each step is: AddOp(even, odd); odd = left_rot(odd, rotates[k]);
+ * XorOp(odd, even). One loop pass covers up to 8 rounds; the
+ * `if (rounds == n) break;` checks leave the loop after n rounds when the
+ * final batch is partial. rotates[] is read at indices 0..63:
+ * 8 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds -=8)
+ {
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+ AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[ 4]); XorOp( 9, 8);
+ AddOp(10,11); b[11] = left_rot(b[11], rotates[ 5]); XorOp(11,10);
+ AddOp(12,13); b[13] = left_rot(b[13], rotates[ 6]); XorOp(13,12);
+ AddOp(14,15); b[15] = left_rot(b[15], rotates[ 7]); XorOp(15,14);
+ if (rounds == 1) break;
+
+ AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[ 8]); XorOp( 9, 0);
+ AddOp( 2,13); b[13] = left_rot(b[13], rotates[ 9]); XorOp(13, 2);
+ AddOp( 6,11); b[11] = left_rot(b[11], rotates[10]); XorOp(11, 6);
+ AddOp( 4,15); b[15] = left_rot(b[15], rotates[11]); XorOp(15, 4);
+ AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[12]); XorOp( 7,10);
+ AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3,12);
+ AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5,14);
+ AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 8);
+ if (rounds == 2) break;
+
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[16]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[17]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[18]); XorOp( 3, 4);
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[19]); XorOp( 1, 6);
+ AddOp(12,15); b[15] = left_rot(b[15], rotates[20]); XorOp(15,12);
+ AddOp(14,13); b[13] = left_rot(b[13], rotates[21]); XorOp(13,14);
+ AddOp( 8,11); b[11] = left_rot(b[11], rotates[22]); XorOp(11, 8);
+ AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[23]); XorOp( 9,10);
+ if (rounds == 3) break;
+
+ AddOp( 0,15); b[15] = left_rot(b[15], rotates[24]); XorOp(15, 0);
+ AddOp( 2,11); b[11] = left_rot(b[11], rotates[25]); XorOp(11, 2);
+ AddOp( 6,13); b[13] = left_rot(b[13], rotates[26]); XorOp(13, 6);
+ AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[27]); XorOp( 9, 4);
+ AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1,14);
+ AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[29]); XorOp( 5, 8);
+ AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[30]); XorOp( 3,10);
+ AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[31]); XorOp( 7,12);
+ if (rounds == 4) break;
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[32]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[33]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[34]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[35]); XorOp( 7, 6);
+ AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[36]); XorOp( 9, 8);
+ AddOp(10,11); b[11] = left_rot(b[11], rotates[37]); XorOp(11,10);
+ AddOp(12,13); b[13] = left_rot(b[13], rotates[38]); XorOp(13,12);
+ AddOp(14,15); b[15] = left_rot(b[15], rotates[39]); XorOp(15,14);
+ if (rounds == 5) break;
+
+ AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[40]); XorOp( 9, 0);
+ AddOp( 2,13); b[13] = left_rot(b[13], rotates[41]); XorOp(13, 2);
+ AddOp( 6,11); b[11] = left_rot(b[11], rotates[42]); XorOp(11, 6);
+ AddOp( 4,15); b[15] = left_rot(b[15], rotates[43]); XorOp(15, 4);
+ AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[44]); XorOp( 7,10);
+ AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[45]); XorOp( 3,12);
+ AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[46]); XorOp( 5,14);
+ AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[47]); XorOp( 1, 8);
+ if (rounds == 6) break;
+
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[48]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[49]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[50]); XorOp( 3, 4);
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[51]); XorOp( 1, 6);
+ AddOp(12,15); b[15] = left_rot(b[15], rotates[52]); XorOp(15,12);
+ AddOp(14,13); b[13] = left_rot(b[13], rotates[53]); XorOp(13,14);
+ AddOp( 8,11); b[11] = left_rot(b[11], rotates[54]); XorOp(11, 8);
+ AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[55]); XorOp( 9,10);
+ if (rounds == 7) break;
+
+ AddOp( 0,15); b[15] = left_rot(b[15], rotates[56]); XorOp(15, 0);
+ AddOp( 2,11); b[11] = left_rot(b[11], rotates[57]); XorOp(11, 2);
+ AddOp( 6,13); b[13] = left_rot(b[13], rotates[58]); XorOp(13, 6);
+ AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[59]); XorOp( 9, 4);
+ AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[60]); XorOp( 1,14);
+ AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[61]); XorOp( 5, 8);
+ AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[62]); XorOp( 3,10);
+ AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[63]); XorOp( 7,12);
+ }
+ }
+
+void fwd_cycle_8_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` forward rounds in place on the 8-word block b[0..7],
+ * using the AddOp/XorOp macros (plain OR unless TEST_OR is defined).
+ * Each step is: AddOp(even, odd); odd = left_rot(odd, rotates[k]);
+ * XorOp(odd, even). One loop pass covers up to 8 rounds; the
+ * `if (rounds == n) break;` checks leave the loop after n rounds when the
+ * final batch is partial. rotates[] is read at indices 0..31:
+ * 4 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds -=8)
+ {
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+ if (rounds == 1) break;
+
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 2);
+ AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[ 5]); XorOp( 7, 4);
+ AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[ 6]); XorOp( 5, 6);
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 7]); XorOp( 3, 0);
+ if (rounds == 2) break;
+
+ AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 4);
+ AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 6);
+ AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[10]); XorOp( 5, 0);
+ AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[11]); XorOp( 7, 2);
+ if (rounds == 3) break;
+
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 6);
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[13]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[15]); XorOp( 3, 4);
+ if (rounds == 4) break;
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[16]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[17]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[18]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[19]); XorOp( 7, 6);
+ if (rounds == 5) break;
+
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[20]); XorOp( 1, 2);
+ AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[21]); XorOp( 7, 4);
+ AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[22]); XorOp( 5, 6);
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[23]); XorOp( 3, 0);
+ if (rounds == 6) break;
+
+ AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[24]); XorOp( 1, 4);
+ AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[25]); XorOp( 3, 6);
+ AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[26]); XorOp( 5, 0);
+ AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[27]); XorOp( 7, 2);
+ if (rounds == 7) break;
+
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1, 6);
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[29]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[30]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[31]); XorOp( 3, 4);
+ }
+ }
+
+void fwd_cycle_4_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` forward rounds in place on the 4-word block b[0..3],
+ * using the AddOp/XorOp macros (plain OR unless TEST_OR is defined).
+ * Each step is: AddOp(even, odd); odd = left_rot(odd, rotates[k]);
+ * XorOp(odd, even). One loop pass covers up to 8 rounds; the
+ * `if (rounds == n) break;` checks leave the loop after n rounds when the
+ * final batch is partial. rotates[] is read at indices 0..15:
+ * 2 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds -=8)
+ {
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ if (rounds == 1) break;
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 2]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 3]); XorOp( 1, 2);
+ if (rounds == 2) break;
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 5]); XorOp( 3, 2);
+ if (rounds == 3) break;
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 6]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 7]); XorOp( 1, 2);
+ if (rounds == 4) break;
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 2);
+ if (rounds == 5) break;
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[10]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[11]); XorOp( 1, 2);
+ if (rounds == 6) break;
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3, 2);
+ if (rounds == 7) break;
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[14]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 2);
+ }
+ }
+
+/* reverse versions of the cipher, using OR */
+void rev_cycle_16_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` reverse-order rounds in place on the 16-word block
+ * b[0..15], using the XorOp/SubOp macros (plain OR unless TEST_OR is
+ * defined). Each step is: XorOp(odd, even);
+ * odd = right_rot(odd, rotates[k]); SubOp(even, odd).
+ * The switch on (rounds & 7) enters the unrolled sequence at the highest
+ * pending round and falls through down to round 1 (case 0 is a full batch
+ * of 8), so the missing `break`s are intentional. The loop step
+ * (rounds-1) & ~7 then drops to the remaining multiple of 8.
+ * rotates[] is read at indices 0..63: 8 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds = (rounds-1) & ~7)
+ {
+ switch (rounds & 7)
+ {
+ case 0: /* round 8, then fall through */
+ XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[63]); SubOp(12, 7);
+ XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[62]); SubOp(10, 3);
+ XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[61]); SubOp( 8, 5);
+ XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[60]); SubOp(14, 1);
+ XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[59]); SubOp( 4, 9);
+ XorOp(13, 6); b[13] = right_rot(b[13], rotates[58]); SubOp( 6,13);
+ XorOp(11, 2); b[11] = right_rot(b[11], rotates[57]); SubOp( 2,11);
+ XorOp(15, 0); b[15] = right_rot(b[15], rotates[56]); SubOp( 0,15);
+ case 7: /* round 7 */
+ XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[55]); SubOp(10, 9);
+ XorOp(11, 8); b[11] = right_rot(b[11], rotates[54]); SubOp( 8,11);
+ XorOp(13,14); b[13] = right_rot(b[13], rotates[53]); SubOp(14,13);
+ XorOp(15,12); b[15] = right_rot(b[15], rotates[52]); SubOp(12,15);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[51]); SubOp( 6, 1);
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[50]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[49]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[48]); SubOp( 0, 7);
+ case 6: /* round 6 */
+ XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[47]); SubOp( 8, 1);
+ XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[46]); SubOp(14, 5);
+ XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[45]); SubOp(12, 3);
+ XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[44]); SubOp(10, 7);
+ XorOp(15, 4); b[15] = right_rot(b[15], rotates[43]); SubOp( 4,15);
+ XorOp(11, 6); b[11] = right_rot(b[11], rotates[42]); SubOp( 6,11);
+ XorOp(13, 2); b[13] = right_rot(b[13], rotates[41]); SubOp( 2,13);
+ XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[40]); SubOp( 0, 9);
+ case 5: /* round 5 */
+ XorOp(15,14); b[15] = right_rot(b[15], rotates[39]); SubOp(14,15);
+ XorOp(13,12); b[13] = right_rot(b[13], rotates[38]); SubOp(12,13);
+ XorOp(11,10); b[11] = right_rot(b[11], rotates[37]); SubOp(10,11);
+ XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[36]); SubOp( 8, 9);
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[35]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[34]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[33]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[32]); SubOp( 0, 1);
+ case 4: /* round 4 */
+ XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[31]); SubOp(12, 7);
+ XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[30]); SubOp(10, 3);
+ XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[29]); SubOp( 8, 5);
+ XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp(14, 1);
+ XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[27]); SubOp( 4, 9);
+ XorOp(13, 6); b[13] = right_rot(b[13], rotates[26]); SubOp( 6,13);
+ XorOp(11, 2); b[11] = right_rot(b[11], rotates[25]); SubOp( 2,11);
+ XorOp(15, 0); b[15] = right_rot(b[15], rotates[24]); SubOp( 0,15);
+ case 3: /* round 3 */
+ XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[23]); SubOp(10, 9);
+ XorOp(11, 8); b[11] = right_rot(b[11], rotates[22]); SubOp( 8,11);
+ XorOp(13,14); b[13] = right_rot(b[13], rotates[21]); SubOp(14,13);
+ XorOp(15,12); b[15] = right_rot(b[15], rotates[20]); SubOp(12,15);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[19]); SubOp( 6, 1);
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[18]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[17]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[16]); SubOp( 0, 7);
+ case 2: /* round 2 */
+ XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 8, 1);
+ XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp(14, 5);
+ XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp(12, 3);
+ XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[12]); SubOp(10, 7);
+ XorOp(15, 4); b[15] = right_rot(b[15], rotates[11]); SubOp( 4,15);
+ XorOp(11, 6); b[11] = right_rot(b[11], rotates[10]); SubOp( 6,11);
+ XorOp(13, 2); b[13] = right_rot(b[13], rotates[ 9]); SubOp( 2,13);
+ XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[ 8]); SubOp( 0, 9);
+ case 1: /* round 1 */
+ XorOp(15,14); b[15] = right_rot(b[15], rotates[ 7]); SubOp(14,15);
+ XorOp(13,12); b[13] = right_rot(b[13], rotates[ 6]); SubOp(12,13);
+ XorOp(11,10); b[11] = right_rot(b[11], rotates[ 5]); SubOp(10,11);
+ XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[ 4]); SubOp( 8, 9);
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+
+ }
+ }
+
+void rev_cycle_8_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` reverse-order rounds in place on the 8-word block
+ * b[0..7], using the XorOp/SubOp macros (plain OR unless TEST_OR is
+ * defined). Each step is: XorOp(odd, even);
+ * odd = right_rot(odd, rotates[k]); SubOp(even, odd).
+ * The switch on (rounds & 7) enters the unrolled sequence at the highest
+ * pending round and falls through down to round 1 (case 0 is a full batch
+ * of 8), so the missing `break`s are intentional. The loop step
+ * (rounds-1) & ~7 then drops to the remaining multiple of 8.
+ * rotates[] is read at indices 0..31: 4 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds = (rounds-1) & ~7)
+ {
+ switch (rounds & 7)
+ {
+ case 0: /* round 8, then fall through */
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[31]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[30]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[29]); SubOp( 0, 7);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp( 6, 1);
+ case 7: /* round 7 */
+ XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[27]); SubOp( 2, 7);
+ XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[26]); SubOp( 0, 5);
+ XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[25]); SubOp( 6, 3);
+ XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[24]); SubOp( 4, 1);
+ case 6: /* round 6 */
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[23]); SubOp( 0, 3);
+ XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[22]); SubOp( 6, 5);
+ XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[21]); SubOp( 4, 7);
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[20]); SubOp( 2, 1);
+ case 5: /* round 5 */
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[19]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[18]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[17]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[16]); SubOp( 0, 1);
+ case 4: /* round 4 */
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[15]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[13]); SubOp( 0, 7);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 6, 1);
+ case 3: /* round 3 */
+ XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[11]); SubOp( 2, 7);
+ XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[10]); SubOp( 0, 5);
+ XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 6, 3);
+ XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 4, 1);
+ case 2: /* round 2 */
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 7]); SubOp( 0, 3);
+ XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[ 6]); SubOp( 6, 5);
+ XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[ 5]); SubOp( 4, 7);
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 2, 1);
+ case 1: /* round 1 */
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+ }
+ }
+
+void rev_cycle_4_or(Word *b, const u08b *rotates, int rounds)
+ { /* Runs `rounds` reverse-order rounds in place on the 4-word block
+ * b[0..3], using the XorOp/SubOp macros (plain OR unless TEST_OR is
+ * defined). Each step is: XorOp(odd, even);
+ * odd = right_rot(odd, rotates[k]); SubOp(even, odd).
+ * The switch on (rounds & 7) enters the unrolled sequence at the highest
+ * pending round and falls through down to round 1 (case 0 is a full batch
+ * of 8), so the missing `break`s are intentional. The loop step
+ * (rounds-1) & ~7 then drops to the remaining multiple of 8.
+ * rotates[] is read at indices 0..15: 2 entries per round, 8 rounds. */
+ for (;rounds > 0;rounds = (rounds-1) & ~7)
+ {
+ switch (rounds & 7)
+ {
+ case 0: /* round 8, then fall through */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[14]); SubOp( 0, 3);
+ case 7: /* round 7 */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 0, 1);
+ case 6: /* round 6 */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[11]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[10]); SubOp( 0, 3);
+ case 5: /* round 5 */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 0, 1);
+ case 4: /* round 4 */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 7]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 6]); SubOp( 0, 3);
+ case 3: /* round 3 */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 5]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 0, 1);
+ case 2: /* round 2 */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 3]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 2]); SubOp( 0, 3);
+ case 1: /* round 1 */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+ }
+ }
+
+/* optimized versions for default round counts */
+#if defined(__BORLANDC__) /* the fixed-round routines never read `rounds`; silence unused-argument warnings */
+#pragma argsused
+#elif defined(_MSC_VER) /* C4100 = unreferenced formal parameter; not re-enabled afterwards */
+#pragma warning(disable:4100)
+#endif
+void fwd_cycle_16_or_r9(Word *b, const u08b *rotates, int rounds)
+ { /* Fully unrolled, fixed 9-round forward pass over the 16-word block
+ * b[0..15], using the AddOp/XorOp macros (plain OR unless TEST_OR is
+ * defined). Eight rounds consume rotates[0..63]; the ninth round repeats
+ * the round-1 word pairing and reuses rotates[0..7].
+ * The `rounds` argument is not read. */
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+ AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[ 4]); XorOp( 9, 8);
+ AddOp(10,11); b[11] = left_rot(b[11], rotates[ 5]); XorOp(11,10);
+ AddOp(12,13); b[13] = left_rot(b[13], rotates[ 6]); XorOp(13,12);
+ AddOp(14,15); b[15] = left_rot(b[15], rotates[ 7]); XorOp(15,14);
+
+ AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[ 8]); XorOp( 9, 0);
+ AddOp( 2,13); b[13] = left_rot(b[13], rotates[ 9]); XorOp(13, 2);
+ AddOp( 6,11); b[11] = left_rot(b[11], rotates[10]); XorOp(11, 6);
+ AddOp( 4,15); b[15] = left_rot(b[15], rotates[11]); XorOp(15, 4);
+ AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[12]); XorOp( 7,10);
+ AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3,12);
+ AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5,14);
+ AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 8);
+
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[16]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[17]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[18]); XorOp( 3, 4);
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[19]); XorOp( 1, 6);
+ AddOp(12,15); b[15] = left_rot(b[15], rotates[20]); XorOp(15,12);
+ AddOp(14,13); b[13] = left_rot(b[13], rotates[21]); XorOp(13,14);
+ AddOp( 8,11); b[11] = left_rot(b[11], rotates[22]); XorOp(11, 8);
+ AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[23]); XorOp( 9,10);
+
+ AddOp( 0,15); b[15] = left_rot(b[15], rotates[24]); XorOp(15, 0);
+ AddOp( 2,11); b[11] = left_rot(b[11], rotates[25]); XorOp(11, 2);
+ AddOp( 6,13); b[13] = left_rot(b[13], rotates[26]); XorOp(13, 6);
+ AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[27]); XorOp( 9, 4);
+ AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1,14);
+ AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[29]); XorOp( 5, 8);
+ AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[30]); XorOp( 3,10);
+ AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[31]); XorOp( 7,12);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[32]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[33]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[34]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[35]); XorOp( 7, 6);
+ AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[36]); XorOp( 9, 8);
+ AddOp(10,11); b[11] = left_rot(b[11], rotates[37]); XorOp(11,10);
+ AddOp(12,13); b[13] = left_rot(b[13], rotates[38]); XorOp(13,12);
+ AddOp(14,15); b[15] = left_rot(b[15], rotates[39]); XorOp(15,14);
+
+ AddOp( 0, 9); b[ 9] = left_rot(b[ 9], rotates[40]); XorOp( 9, 0);
+ AddOp( 2,13); b[13] = left_rot(b[13], rotates[41]); XorOp(13, 2);
+ AddOp( 6,11); b[11] = left_rot(b[11], rotates[42]); XorOp(11, 6);
+ AddOp( 4,15); b[15] = left_rot(b[15], rotates[43]); XorOp(15, 4);
+ AddOp(10, 7); b[ 7] = left_rot(b[ 7], rotates[44]); XorOp( 7,10);
+ AddOp(12, 3); b[ 3] = left_rot(b[ 3], rotates[45]); XorOp( 3,12);
+ AddOp(14, 5); b[ 5] = left_rot(b[ 5], rotates[46]); XorOp( 5,14);
+ AddOp( 8, 1); b[ 1] = left_rot(b[ 1], rotates[47]); XorOp( 1, 8);
+
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[48]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[49]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[50]); XorOp( 3, 4);
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[51]); XorOp( 1, 6);
+ AddOp(12,15); b[15] = left_rot(b[15], rotates[52]); XorOp(15,12);
+ AddOp(14,13); b[13] = left_rot(b[13], rotates[53]); XorOp(13,14);
+ AddOp( 8,11); b[11] = left_rot(b[11], rotates[54]); XorOp(11, 8);
+ AddOp(10, 9); b[ 9] = left_rot(b[ 9], rotates[55]); XorOp( 9,10);
+
+ AddOp( 0,15); b[15] = left_rot(b[15], rotates[56]); XorOp(15, 0);
+ AddOp( 2,11); b[11] = left_rot(b[11], rotates[57]); XorOp(11, 2);
+ AddOp( 6,13); b[13] = left_rot(b[13], rotates[58]); XorOp(13, 6);
+ AddOp( 4, 9); b[ 9] = left_rot(b[ 9], rotates[59]); XorOp( 9, 4);
+ AddOp(14, 1); b[ 1] = left_rot(b[ 1], rotates[60]); XorOp( 1,14);
+ AddOp( 8, 5); b[ 5] = left_rot(b[ 5], rotates[61]); XorOp( 5, 8);
+ AddOp(10, 3); b[ 3] = left_rot(b[ 3], rotates[62]); XorOp( 3,10);
+ AddOp(12, 7); b[ 7] = left_rot(b[ 7], rotates[63]); XorOp( 7,12);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+ AddOp( 8, 9); b[ 9] = left_rot(b[ 9], rotates[ 4]); XorOp( 9, 8);
+ AddOp(10,11); b[11] = left_rot(b[11], rotates[ 5]); XorOp(11,10);
+ AddOp(12,13); b[13] = left_rot(b[13], rotates[ 6]); XorOp(13,12);
+ AddOp(14,15); b[15] = left_rot(b[15], rotates[ 7]); XorOp(15,14);
+ }
+
+#if defined(__BORLANDC__) /* `rounds` is not read below; silence the unused-argument warning */
+#pragma argsused
+#endif
+void fwd_cycle_8_or_r8(Word *b, const u08b *rotates, int rounds)
+ { /* Fully unrolled, fixed 8-round forward pass over the 8-word block
+ * b[0..7], using the AddOp/XorOp macros (plain OR unless TEST_OR is
+ * defined). Reads rotates[0..31], 4 entries per round.
+ * The `rounds` argument is not read. */
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[ 2]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[ 3]); XorOp( 7, 6);
+
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 2);
+ AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[ 5]); XorOp( 7, 4);
+ AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[ 6]); XorOp( 5, 6);
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 7]); XorOp( 3, 0);
+
+ AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 4);
+ AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 6);
+ AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[10]); XorOp( 5, 0);
+ AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[11]); XorOp( 7, 2);
+
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 6);
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[13]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[14]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[15]); XorOp( 3, 4);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[16]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[17]); XorOp( 3, 2);
+ AddOp( 4, 5); b[ 5] = left_rot(b[ 5], rotates[18]); XorOp( 5, 4);
+ AddOp( 6, 7); b[ 7] = left_rot(b[ 7], rotates[19]); XorOp( 7, 6);
+
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[20]); XorOp( 1, 2);
+ AddOp( 4, 7); b[ 7] = left_rot(b[ 7], rotates[21]); XorOp( 7, 4);
+ AddOp( 6, 5); b[ 5] = left_rot(b[ 5], rotates[22]); XorOp( 5, 6);
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[23]); XorOp( 3, 0);
+
+ AddOp( 4, 1); b[ 1] = left_rot(b[ 1], rotates[24]); XorOp( 1, 4);
+ AddOp( 6, 3); b[ 3] = left_rot(b[ 3], rotates[25]); XorOp( 3, 6);
+ AddOp( 0, 5); b[ 5] = left_rot(b[ 5], rotates[26]); XorOp( 5, 0);
+ AddOp( 2, 7); b[ 7] = left_rot(b[ 7], rotates[27]); XorOp( 7, 2);
+
+ AddOp( 6, 1); b[ 1] = left_rot(b[ 1], rotates[28]); XorOp( 1, 6);
+ AddOp( 0, 7); b[ 7] = left_rot(b[ 7], rotates[29]); XorOp( 7, 0);
+ AddOp( 2, 5); b[ 5] = left_rot(b[ 5], rotates[30]); XorOp( 5, 2);
+ AddOp( 4, 3); b[ 3] = left_rot(b[ 3], rotates[31]); XorOp( 3, 4);
+ }
+
+#ifdef __BORLANDC__ /* `rounds` is not read below; silence the unused-argument warning */
+#pragma argsused
+#endif
+void fwd_cycle_4_or_r8(Word *b, const u08b *rotates, int rounds)
+ { /* Fully unrolled, fixed 8-round forward pass over the 4-word block
+ * b[0..3], using the AddOp/XorOp macros (plain OR unless TEST_OR is
+ * defined). Reads rotates[0..15], 2 entries per round.
+ * The `rounds` argument is not read. */
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 0]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 1]); XorOp( 3, 2);
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 2]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 3]); XorOp( 1, 2);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 4]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 5]); XorOp( 3, 2);
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[ 6]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[ 7]); XorOp( 1, 2);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[ 8]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[ 9]); XorOp( 3, 2);
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[10]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[11]); XorOp( 1, 2);
+
+ AddOp( 0, 1); b[ 1] = left_rot(b[ 1], rotates[12]); XorOp( 1, 0);
+ AddOp( 2, 3); b[ 3] = left_rot(b[ 3], rotates[13]); XorOp( 3, 2);
+
+ AddOp( 0, 3); b[ 3] = left_rot(b[ 3], rotates[14]); XorOp( 3, 0);
+ AddOp( 2, 1); b[ 1] = left_rot(b[ 1], rotates[15]); XorOp( 1, 2);
+ }
+
+/* reverse versions of the cipher, using OR, for fixed round numbers */
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+/* Fully unrolled reverse pass for the 16-word block: nine groups of eight
+ * XorOp / right_rot / SubOp steps, applied to b[] in place.
+ * The first and the last group both use rotates[7..0]; the groups in
+ * between walk from rotates[63] down to rotates[8].
+ * "rounds" is never referenced in the body (the sequence is fixed); the
+ * parameter keeps the signature usable through cycle_func pointers, as
+ * done in InverseChecks. */
+void rev_cycle_16_or_r9(Word *b, const u08b *rotates, int rounds)
+ {
+ /* group using rotates[7..0] */
+ XorOp(15,14); b[15] = right_rot(b[15], rotates[ 7]); SubOp(14,15);
+ XorOp(13,12); b[13] = right_rot(b[13], rotates[ 6]); SubOp(12,13);
+ XorOp(11,10); b[11] = right_rot(b[11], rotates[ 5]); SubOp(10,11);
+ XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[ 4]); SubOp( 8, 9);
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+
+ /* group using rotates[63..56] */
+ XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[63]); SubOp(12, 7);
+ XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[62]); SubOp(10, 3);
+ XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[61]); SubOp( 8, 5);
+ XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[60]); SubOp(14, 1);
+ XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[59]); SubOp( 4, 9);
+ XorOp(13, 6); b[13] = right_rot(b[13], rotates[58]); SubOp( 6,13);
+ XorOp(11, 2); b[11] = right_rot(b[11], rotates[57]); SubOp( 2,11);
+ XorOp(15, 0); b[15] = right_rot(b[15], rotates[56]); SubOp( 0,15);
+
+ /* group using rotates[55..48] */
+ XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[55]); SubOp(10, 9);
+ XorOp(11, 8); b[11] = right_rot(b[11], rotates[54]); SubOp( 8,11);
+ XorOp(13,14); b[13] = right_rot(b[13], rotates[53]); SubOp(14,13);
+ XorOp(15,12); b[15] = right_rot(b[15], rotates[52]); SubOp(12,15);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[51]); SubOp( 6, 1);
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[50]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[49]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[48]); SubOp( 0, 7);
+
+ /* group using rotates[47..40] */
+ XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[47]); SubOp( 8, 1);
+ XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[46]); SubOp(14, 5);
+ XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[45]); SubOp(12, 3);
+ XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[44]); SubOp(10, 7);
+ XorOp(15, 4); b[15] = right_rot(b[15], rotates[43]); SubOp( 4,15);
+ XorOp(11, 6); b[11] = right_rot(b[11], rotates[42]); SubOp( 6,11);
+ XorOp(13, 2); b[13] = right_rot(b[13], rotates[41]); SubOp( 2,13);
+ XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[40]); SubOp( 0, 9);
+
+ /* group using rotates[39..32] */
+ XorOp(15,14); b[15] = right_rot(b[15], rotates[39]); SubOp(14,15);
+ XorOp(13,12); b[13] = right_rot(b[13], rotates[38]); SubOp(12,13);
+ XorOp(11,10); b[11] = right_rot(b[11], rotates[37]); SubOp(10,11);
+ XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[36]); SubOp( 8, 9);
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[35]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[34]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[33]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[32]); SubOp( 0, 1);
+
+ /* group using rotates[31..24] */
+ XorOp( 7,12); b[ 7] = right_rot(b[ 7], rotates[31]); SubOp(12, 7);
+ XorOp( 3,10); b[ 3] = right_rot(b[ 3], rotates[30]); SubOp(10, 3);
+ XorOp( 5, 8); b[ 5] = right_rot(b[ 5], rotates[29]); SubOp( 8, 5);
+ XorOp( 1,14); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp(14, 1);
+ XorOp( 9, 4); b[ 9] = right_rot(b[ 9], rotates[27]); SubOp( 4, 9);
+ XorOp(13, 6); b[13] = right_rot(b[13], rotates[26]); SubOp( 6,13);
+ XorOp(11, 2); b[11] = right_rot(b[11], rotates[25]); SubOp( 2,11);
+ XorOp(15, 0); b[15] = right_rot(b[15], rotates[24]); SubOp( 0,15);
+
+ /* group using rotates[23..16] */
+ XorOp( 9,10); b[ 9] = right_rot(b[ 9], rotates[23]); SubOp(10, 9);
+ XorOp(11, 8); b[11] = right_rot(b[11], rotates[22]); SubOp( 8,11);
+ XorOp(13,14); b[13] = right_rot(b[13], rotates[21]); SubOp(14,13);
+ XorOp(15,12); b[15] = right_rot(b[15], rotates[20]); SubOp(12,15);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[19]); SubOp( 6, 1);
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[18]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[17]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[16]); SubOp( 0, 7);
+
+ /* group using rotates[15..8] */
+ XorOp( 1, 8); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 8, 1);
+ XorOp( 5,14); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp(14, 5);
+ XorOp( 3,12); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp(12, 3);
+ XorOp( 7,10); b[ 7] = right_rot(b[ 7], rotates[12]); SubOp(10, 7);
+ XorOp(15, 4); b[15] = right_rot(b[15], rotates[11]); SubOp( 4,15);
+ XorOp(11, 6); b[11] = right_rot(b[11], rotates[10]); SubOp( 6,11);
+ XorOp(13, 2); b[13] = right_rot(b[13], rotates[ 9]); SubOp( 2,13);
+ XorOp( 9, 0); b[ 9] = right_rot(b[ 9], rotates[ 8]); SubOp( 0, 9);
+
+ /* final group: rotates[7..0] again */
+ XorOp(15,14); b[15] = right_rot(b[15], rotates[ 7]); SubOp(14,15);
+ XorOp(13,12); b[13] = right_rot(b[13], rotates[ 6]); SubOp(12,13);
+ XorOp(11,10); b[11] = right_rot(b[11], rotates[ 5]); SubOp(10,11);
+ XorOp( 9, 8); b[ 9] = right_rot(b[ 9], rotates[ 4]); SubOp( 8, 9);
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+/* Fully unrolled reverse pass for the 8-word block: eight groups of four
+ * XorOp / right_rot / SubOp steps, applied to b[] in place, walking from
+ * rotates[31] down to rotates[0].
+ * "rounds" is never referenced in the body (the sequence is fixed); the
+ * parameter keeps the signature usable through cycle_func pointers, as
+ * done in InverseChecks. */
+void rev_cycle_8_or_r8(Word *b, const u08b *rotates, int rounds)
+ {
+ /* group using rotates[31..28] */
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[31]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[30]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[29]); SubOp( 0, 7);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[28]); SubOp( 6, 1);
+
+ /* group using rotates[27..24] */
+ XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[27]); SubOp( 2, 7);
+ XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[26]); SubOp( 0, 5);
+ XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[25]); SubOp( 6, 3);
+ XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[24]); SubOp( 4, 1);
+
+ /* group using rotates[23..20] */
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[23]); SubOp( 0, 3);
+ XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[22]); SubOp( 6, 5);
+ XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[21]); SubOp( 4, 7);
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[20]); SubOp( 2, 1);
+
+ /* group using rotates[19..16] */
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[19]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[18]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[17]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[16]); SubOp( 0, 1);
+
+ /* group using rotates[15..12] */
+ XorOp( 3, 4); b[ 3] = right_rot(b[ 3], rotates[15]); SubOp( 4, 3);
+ XorOp( 5, 2); b[ 5] = right_rot(b[ 5], rotates[14]); SubOp( 2, 5);
+ XorOp( 7, 0); b[ 7] = right_rot(b[ 7], rotates[13]); SubOp( 0, 7);
+ XorOp( 1, 6); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 6, 1);
+
+ /* group using rotates[11..8] */
+ XorOp( 7, 2); b[ 7] = right_rot(b[ 7], rotates[11]); SubOp( 2, 7);
+ XorOp( 5, 0); b[ 5] = right_rot(b[ 5], rotates[10]); SubOp( 0, 5);
+ XorOp( 3, 6); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 6, 3);
+ XorOp( 1, 4); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 4, 1);
+
+ /* group using rotates[7..4] */
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 7]); SubOp( 0, 3);
+ XorOp( 5, 6); b[ 5] = right_rot(b[ 5], rotates[ 6]); SubOp( 6, 5);
+ XorOp( 7, 4); b[ 7] = right_rot(b[ 7], rotates[ 5]); SubOp( 4, 7);
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 2, 1);
+
+ /* group using rotates[3..0] */
+ XorOp( 7, 6); b[ 7] = right_rot(b[ 7], rotates[ 3]); SubOp( 6, 7);
+ XorOp( 5, 4); b[ 5] = right_rot(b[ 5], rotates[ 2]); SubOp( 4, 5);
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+
+#ifdef __BORLANDC__
+#pragma argsused
+#endif
+/* Fully unrolled reverse pass for the 4-word block: eight groups of two
+ * XorOp / right_rot / SubOp steps, applied to b[] in place, walking from
+ * rotates[15] down to rotates[0].
+ * "rounds" is never referenced in the body (the sequence is fixed); the
+ * parameter keeps the signature usable through cycle_func pointers, as
+ * done in InverseChecks. */
+void rev_cycle_4_or_r8(Word *b, const u08b *rotates, int rounds)
+ {
+ /* group using rotates[15..14] */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[15]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[14]); SubOp( 0, 3);
+
+ /* group using rotates[13..12] */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[13]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[12]); SubOp( 0, 1);
+
+ /* group using rotates[11..10] */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[11]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[10]); SubOp( 0, 3);
+
+ /* group using rotates[9..8] */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 9]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 8]); SubOp( 0, 1);
+
+ /* group using rotates[7..6] */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 7]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 6]); SubOp( 0, 3);
+
+ /* group using rotates[5..4] */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 5]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 4]); SubOp( 0, 1);
+
+ /* group using rotates[3..2] */
+ XorOp( 1, 2); b[ 1] = right_rot(b[ 1], rotates[ 3]); SubOp( 2, 1);
+ XorOp( 3, 0); b[ 3] = right_rot(b[ 3], rotates[ 2]); SubOp( 0, 3);
+
+ /* group using rotates[1..0] */
+ XorOp( 3, 2); b[ 3] = right_rot(b[ 3], rotates[ 1]); SubOp( 2, 3);
+ XorOp( 1, 0); b[ 1] = right_rot(b[ 1], rotates[ 0]); SubOp( 0, 1);
+ }
+
+
+/* Self-test: check that the forward and reverse cycle routines are truly
+ * inverses of each other, for every block size from 4 words up to
+ * MAX_WORDS_PER_BLK (doubling each time), using random rotation constants.
+ * Also cross-checks the OR variants:
+ *   - with TEST_OR defined, the OR routines are run through the same
+ *     inverse checks as the plain ones;
+ *   - otherwise, the "quick" Hamming-weight assumption is validated:
+ *     moving the single input bit by j positions must rotate every
+ *     output word of fwd_or by j.
+ * Finally the hard-coded fixed-round OR routine is compared against the
+ * variable-round one for every single-bit input.
+ * Any failure prints a message and terminates the program via exit(). */
+void InverseChecks(void)
+ {
+ uint i,j,k,wCnt,tstCnt;
+ int r,rN;
+ Block pt,ct,xt;
+ u08b rots[MAX_ROTS_PER_CYCLE];
+ /* number of random trials per block size: 64 when size_t is 8 bytes, else 8 */
+ uint TEST_CNT = (sizeof(size_t) == 8) ? 64 : 8;
+
+ cycle_func *fwd;
+ cycle_func *rev;
+ cycle_func *fwd_or;
+ cycle_func *fwd_or_rN;
+#ifdef TEST_OR
+ cycle_func *rev_or;
+ cycle_func *rev_or_rN;
+#endif
+
+ Rand_Init(0); /* fixed seed */
+ for (wCnt=4;wCnt<=MAX_WORDS_PER_BLK;wCnt *= 2)
+ {
+ /* select the routines for this block size (anything other than 4 or 8 uses the 16-word set) */
+ switch (wCnt)
+ {
+ case 4: fwd = fwd_cycle_4 ; rev = rev_cycle_4 ;
+ fwd_or = fwd_cycle_4_or ; fwd_or_rN = fwd_cycle_4_or_r8 ; break;
+ case 8: fwd = fwd_cycle_8 ; rev = rev_cycle_8 ;
+ fwd_or = fwd_cycle_8_or ; fwd_or_rN = fwd_cycle_8_or_r8 ; break;
+ default: fwd = fwd_cycle_16 ; rev = rev_cycle_16 ;
+ fwd_or = fwd_cycle_16_or ; fwd_or_rN = fwd_cycle_16_or_r9 ; break;
+ }
+#ifdef TEST_OR
+ switch (wCnt)
+ {
+ case 4: rev_or_rN = rev_cycle_4_or_r8 ; rev_or = rev_cycle_4_or ; break;
+ case 8: rev_or_rN = rev_cycle_8_or_r8 ; rev_or = rev_cycle_8_or ; break;
+ default: rev_or_rN = rev_cycle_16_or_r9 ; rev_or = rev_cycle_16_or ; break;
+ }
+#endif
+ for (tstCnt=0;tstCnt<TEST_CNT;tstCnt++)
+ {
+ if (tstCnt == 0)
+ {
+ memset(pt.x,0,sizeof(pt)); /* make the first test simple, for debug */
+ pt.x[0]++;
+ }
+ else
+ RandBytes(pt.x,wCnt*sizeof(pt.x[0]));
+
+ RandBytes(rots,sizeof(rots)); /* use random rotation constants */
+ for (i=0;i<MAX_ROTS_PER_CYCLE;i++)
+ rots[i] &= (BITS_PER_WORD-1); /* keep every rotation count below BITS_PER_WORD */
+ for (r=1;r<32;r++)
+ {
+ /* rev followed by fwd must restore the plaintext */
+ ct=pt;
+ rev(ct.x,rots,r);
+ fwd(ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure: #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+ /* ... and so must fwd followed by rev (ct equals pt again at this point) */
+ fwd(ct.x,rots,r);
+ rev(ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure: #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+#ifdef TEST_OR
+ /* same round-trip checks, mixing the "_or" routines with the plain ones */
+ fwd_or(ct.x,rots,r);
+ rev (ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure (fwd_or): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+ fwd (ct.x,rots,r);
+ rev_or(ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure (rev_or): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+ /* the fixed-round routines are only checked at their own round count (9 for 16 words, else 8) */
+ if (r != ((wCnt == 16) ? 9 : 8))
+ continue;
+ fwd_or_rN(ct.x,rots,r);
+ rev (ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure (fwd_or_rN): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+ fwd (ct.x,rots,r);
+ rev_or_rN(ct.x,rots,r);
+ if (memcmp(pt.x,ct.x,wCnt*sizeof(pt.x[0])))
+ {
+ printf("Inverse failure (rev_or_rN): #%03d: wCnt=%d. r=%2d",tstCnt,wCnt,r);
+ exit(8);
+ }
+#else
+ /* validate that "quick" Hamming weight checks are ok, using OR */
+ for (i=0;i<wCnt;i++)
+ {
+ /* reference result: single input bit at position 0 of word i */
+ memset(ct.x,0,sizeof(ct.x));
+ ct.x[i]=1;
+ fwd_or(ct.x,rots,r);
+ for (j=1;j<64;j++)
+ {
+ /* same word, input bit moved to position j */
+ memset(xt.x,0,sizeof(xt.x));
+ xt.x[i]=((u64b) 1) << j;
+ fwd_or(xt.x,rots,r);
+ /* every output word must be the reference word rotated left by j */
+ for (k=0;k<wCnt;k++)
+ if (left_rot(ct.x[k],j) != xt.x[k])
+ {
+ printf("Quick HW check failure: blk=%4d bits. r=%d. j=%d",wCnt*64,r,j);
+ exit(2);
+ }
+ }
+ }
+#endif
+ }
+ }
+ /* test the "hard coded" versions against variable versions of OR routines */
+ for (tstCnt=0;tstCnt<TEST_CNT;tstCnt++)
+ {
+ RandBytes(rots,sizeof(rots));
+ for (i=0;i<MAX_ROTS_PER_CYCLE;i++)
+ rots[i] &= (BITS_PER_WORD-1);
+ rN = (wCnt == 16) ? 9 : 8; /* round count built into the hard-coded routine */
+ for (i=0;i<wCnt*64;i++)
+ {
+ /* one-hot input: bit (i % 64) of word (i / 64) */
+ memset(pt.x,0,sizeof(pt));
+ pt.x[i / 64] = ((u64b) 1) << (i % 64);
+ ct=pt;
+ xt=pt;
+ fwd_or (ct.x,rots,rN);
+ fwd_or_rN(xt.x,rots,rN);
+ if (memcmp(xt.x,ct.x,wCnt*sizeof(xt.x[0])))
+ {
+ printf("OR failure: #%03d: wCnt=%d. i=%2d",tstCnt,wCnt,i);
+ exit(8);
+ }
+ }
+ }
+ }
+ }
+
+/* Population count: return the number of 1 bits in x.                      */
+/* Adjacent bit fields are summed pairwise, doubling the field width at     */
+/* each step; the final cast to uint keeps the low-order total.             */
+uint HammingWeight(Word x)
+    {
+#if BITS_PER_WORD == 64
+    x = ((x >>  1) & DUP_64(0x55555555)) + (x & DUP_64(0x55555555));
+    x = ((x >>  2) & DUP_64(0x33333333)) + (x & DUP_64(0x33333333));
+    x = ((x >>  4) & DUP_64(0x0F0F0F0F)) + (x & DUP_64(0x0F0F0F0F));
+    x = ((x >>  8) & DUP_64(0x00FF00FF)) + (x & DUP_64(0x00FF00FF));
+    x = ((x >> 16) & DUP_64(0x0000FFFF)) + (x & DUP_64(0x0000FFFF));
+    x = ((x >> 32) & DUP_64(0x000000FF)) + (x & DUP_64(0x000000FF));
+#else
+    x = ((x >>  1) & 0x55555555) + (x & 0x55555555);
+    x = ((x >>  2) & 0x33333333) + (x & 0x33333333);
+    x = ((x >>  4) & 0x0F0F0F0F) + (x & 0x0F0F0F0F);
+    x = ((x >>  8) & 0x00FF00FF) + (x & 0x00FF00FF);
+    x = ((x >> 16) & 0x000000FF) + (x & 0x0000FFFF);
+#endif
+    return (uint) x;
+    }
+
+
+/* Compute a 32-bit CRC over the first rotsPerCycle entries of r->rotList  */
+/* and store it in r->CRC.  The value serves as a quick ID for a rotation  */
+/* set.  Each entry is xor-ed into the low bits of the running value,      */
+/* followed by eight shift/feedback steps.                                 */
+void Set_CRC(rSearchRec *r)
+    {
+#define CRC_FDBK ((0x04C11DB7u >> 1) ^ 0x80000000u) /* CRC-32-IEEE-802.3 (from Wikipedia) */
+    uint crc = ~0u;
+    uint idx,bit;
+
+    for (idx=0; idx<rotsPerCycle; idx++)
+        {
+        crc ^= r->rotList[idx];
+        for (bit=0; bit<8; bit++)
+            {   /* one shift/feedback step per bit */
+            if (crc & 1)
+                crc = (crc >> 1) ^ CRC_FDBK;
+            else
+                crc >>= 1;
+            }
+        }
+    r->CRC = crc;
+    }
+
+/* qsort comparison callback for rSearchRec entries.                        */
+/* Primary key: rWorst, larger values first.                                */
+/* Tie-break:   ID, smaller values first.                                   */
+int Compare_SearchRec_Descending(const void *aPtr,const void *bPtr)
+    {
+    const rSearchRec *recA = (const rSearchRec *) aPtr;
+    const rSearchRec *recB = (const rSearchRec *) bPtr;
+    uint keyA = recA->rWorst;
+    uint keyB = recB->rWorst;
+
+    if (keyA != keyB)
+        return (keyA < keyB) ? +1 : -1;     /* descending by rWorst */
+
+    keyA = recA->ID;                        /* equal metric: fall back to the ID */
+    keyB = recB->ID;
+    if (keyA != keyB)
+        return (keyA < keyB) ? -1 : +1;     /* ascending by ID */
+    return 0;
+    }
+
+/* Return the current calendar time as text, exactly as produced by ctime() */
+/* (the string lives in ctime's own storage and is not to be freed).        */
+const char *ASCII_TimeDate(void)
+    {
+    time_t now = time(NULL);
+
+    return ctime(&now);
+    }
+
+/* Return the minimum Hamming weight seen in any output word after mixing   */
+/* a block that has a single 1 bit (bit 0 of one word), over all words and  */
+/* over every rotation "version" enabled in verMask:                        */
+/*    version bit 0 set --> rotation list with its two halves swapped       */
+/*    version bit 1 set --> reverse routine instead of forward              */
+/* A non-zero minHW selects the "_rN" routines and makes the function       */
+/* return 0 as soon as any word falls below minHW; minHW == 0 selects the   */
+/* general OR routines (saturation check) and never exits early.            */
+int Cycle_Min_HW(uint rounds, const u08b *rotList,uint minHW,uint verMask)
+    {
+    const uint half = rotsPerCycle/2;
+    uint ver,w,n,wt;
+    uint lowest = BITS_PER_WORD;
+    u08b sched[MAX_ROTS_PER_CYCLE];
+    Block blk;
+    cycle_func *mix;
+
+    for (ver=0; ver<MAX_ROT_VER_CNT; ver++)
+        {
+        if (!(verMask & (1 << ver)))
+            continue;                       /* version not requested */
+
+        if ((ver & 1) == 0)
+            memcpy(sched,rotList,rotsPerCycle*sizeof(sched[0]));
+        else
+            {                               /* "half-cycle": swap the two halves */
+            for (n=0; n<rotsPerCycle; n++)
+                sched[n] = rotList[(n < half) ? n + half : n - half];
+            }
+
+        /* pick the mixing routine once per version */
+        if (minHW)
+            mix = (ver & 2) ? rev_cycle_or_rN : fwd_cycle_or_rN;    /* "_rN" versions for speed */
+        else
+            mix = (ver & 2) ? rev_cycle_or    : fwd_cycle_or;       /* saturation check */
+
+        for (w=0; w<wordsPerBlock; w++)
+            {
+            memset(blk.x,0,wordsPerBlock*sizeof(blk.x[0]));
+            blk.x[w] = 1;                   /* test propagation from one word */
+            mix(blk.x,sched,(int)rounds);
+            for (n=0; n<wordsPerBlock; n++)
+                {
+                wt = HammingWeight(blk.x[n]);
+                if (wt < minHW)
+                    return 0;               /* not good enough: stop right away */
+                if (wt < lowest)
+                    lowest = wt;            /* else track the minimum */
+                }
+            }
+        }
+    return lowest;
+    }
+
+/* Refresh the bookkeeping of a search record:                              */
+/*    - recompute r->CRC (via Set_CRC);                                     */
+/*    - for each rotation version v, store in r->hw_OR[v] the minimum       */
+/*      Hamming weight of any output word after mixing single-bit input    */
+/*      blocks with the OR routines (BITS_PER_WORD for versions that are    */
+/*      not enabled in verMask).                                            */
+/* Returns the minimum over all enabled versions.  Unlike Cycle_Min_HW,     */
+/* there is no early exit.                                                  */
+uint Set_Min_hw_OR(rSearchRec *r,uint verMask,uint rounds)
+    {
+    const uint half = rotsPerCycle/2;
+    uint ver,bitPos,n,wt;
+    uint overallMin = BITS_PER_WORD;
+    u08b sched[MAX_ROTS_PER_CYCLE];
+    Block blk;
+    cycle_func *mix;
+
+    Set_CRC(r);
+    for (ver=0; ver<MAX_ROT_VER_CNT; ver++)
+        {
+        r->hw_OR[ver] = BITS_PER_WORD;
+        if (!(verMask & (1 << ver)))
+            continue;                       /* version not requested */
+
+        if ((ver & 1) == 0)
+            memcpy(sched,r->rotList,rotsPerCycle*sizeof(sched[0]));
+        else
+            {                               /* "half-cycle": swap the two halves */
+            for (n=0; n<rotsPerCycle; n++)
+                sched[n] = r->rotList[(n < half) ? n + half : n - half];
+            }
+
+        mix = (ver & 2) ? rev_cycle_or : fwd_cycle_or;
+
+        /* bitPos advances one whole word at a time */
+        for (bitPos=0; bitPos<bitsPerBlock; bitPos+=BITS_PER_WORD)
+            {
+            memset(blk.x,0,sizeof(blk.x));
+            blk.x[bitPos/BITS_PER_WORD] |= (((u64b) 1) << (bitPos%BITS_PER_WORD));
+            mix(blk.x,sched,(int) rounds);
+            for (n=0; n<wordsPerBlock; n++)
+                {
+                wt = HammingWeight(blk.x[n]);
+                if (wt < overallMin)
+                    overallMin = wt;
+                if ((u08b) wt < r->hw_OR[ver])
+                    r->hw_OR[ver] = (u08b) wt;
+                }
+            }
+        }
+    return overallMin;
+    }
+
+/* Print, for round counts 4 through 11, the minimum Hamming weight that    */
+/* Cycle_Min_HW reports (saturation mode, minHW == 0) for each of the four  */
+/* rotation versions, together with the overall minimum.                    */
+void Show_HW_rounds(const u08b *rotates)
+    {
+    enum { FIRST_RND = 4, END_RND = 12, VER_CNT = 4 };
+    uint rnd,ver,best,perVer[VER_CNT];
+
+    for (rnd=FIRST_RND; rnd<END_RND; rnd++)
+        {
+        best = bitsPerBlock;
+        for (ver=0; ver<VER_CNT; ver++)
+            {   /* evaluate one version at a time */
+            perVer[ver] = Cycle_Min_HW(rnd,rotates,0,1 << ver);
+            if (perVer[ver] < best)
+                best = perVer[ver];
+            }
+        printf("%2d rounds: minHW = %2d [",rnd,best);
+        for (ver=0; ver<VER_CNT; ver++)     /* show the different "versions" */
+            printf(" %2d",perVer[ver]);
+        printf(" ]\n");
+        }
+    }
+
+/* Read the next rotation set from a search-results file.
+ *
+ * rfName: name of the file.  A leading '+' is stripped from the name and
+ *         turns on a printed Hamming-weight profile for every set read.
+ *
+ * Returns a pointer to a static array holding rotsPerCycle rotation values,
+ * or NULL when no further valid set can be read.  On NULL the file has been
+ * closed; a later call would re-open it from the start.
+ *
+ * The file handle, the result array and the set counter are static, so the
+ * function carries state from one call to the next.  The program exits if
+ * the file cannot be opened.
+ */
+const u08b *get_rotation_file(const char *rfName)
+    {
+    enum { MAX_LINE = 512 };
+    char line[MAX_LINE+4];
+    uint i,rotVal;
+    uint rotShow=0;
+    static FILE *rf=NULL;
+    static u08b rotates[MAX_ROTS_PER_CYCLE];
+    static uint rotCnt =0;
+/**** sample format:
++++++++++++++ Preliminary results: sampleCnt = 1024, block = 256 bits
+rMin = 0.425. #079C[*21] [CRC=D89E7C72. hw_OR=62. cnt= 1024. blkSize= 256]
+ 46 52
+ 21 38
+ 13 13
+ 20 27
+ 14 40
+ 43 26
+ 35 29
+ 19 63
+rMin = 0.425. #0646[*17] [CRC=527174F3. hw_OR=61. cnt= 1024. blkSize= 256]
+ 26 24
+ 50 48
+ 40 25
+ 36 55
+ 10 20
+ 10 16
+ 60 55
+ 18 7
+...
+****/
+    if (rfName[0] == '+')
+        {
+        rfName++;
+        rotShow = 1;
+        }
+    if (rf == NULL)
+        {
+        rf = fopen(rfName,"rt");
+        if (rf == NULL)
+            {
+            printf("Unable to open rotation file '%s'",rfName);
+            exit(2);
+            }
+        rotCnt=0;
+        for (;;)                    /* skip to "preliminary results" section */
+            {
+            line[0]=0;
+            if (fgets(line,sizeof(line)-4,rf) == NULL || line[0] == 0)
+                {
+                fclose(rf);         /* eof --> stop */
+                rf = NULL;
+                return NULL;
+                }
+            /* check for the header */
+            if (line[0] != '+' || line[1] != '+' || line[2] != '+' ||
+                strstr(line,"reliminary results:") == NULL)
+                continue;
+            /* now check for the correct block size */
+            for (i=strlen(line);i;i--)  /* start at eol and look backwards */
+                if (line[i-1] == '=')   /* check for '=' sign for block size */
+                    break;
+            if (i > 0 && sscanf(line+i,"%u bits",&i) == 1 && i == bitsPerBlock)
+                break;
+            }
+        }
+    /* now at the rMin line */
+    line[0]=0;
+    if (fgets(line,sizeof(line)-4,rf) == NULL || line[0] == 0 || strncmp(line,"rMin =",6))
+        {
+        fclose(rf);
+        rf = NULL;
+        return NULL;
+        }
+
+    /* now read in all the rotation values */
+    for (i=0;i<rotsPerCycle;i++)
+        {
+        /* A rotation count must be smaller than the word size.  The bound  */
+        /* used to be bitsPerBlock, which let values >= BITS_PER_WORD       */
+        /* through (and silently truncated them when stored as u08b).       */
+        if (fscanf(rf,"%u",&rotVal) != 1 || rotVal >= BITS_PER_WORD)
+            {                       /* Invalid rotation value */
+            fclose(rf);
+            rf = NULL;
+            return NULL;
+            }
+        rotates[i] = (u08b) rotVal;
+        }
+    if (fgets(line,sizeof(line)-4,rf) == NULL)     /* skip eol */
+        {
+        /* End of file right after the last value (no trailing newline).    */
+        /* The set just read is still valid, so keep the stream open: the   */
+        /* next call then fails at the "rMin" line and reports exhaustion.  */
+        /* Closing the file here would make the next call re-open it and    */
+        /* replay the same sets from the top.                               */
+        }
+    if (rotShow)
+        {                           /* show the hamming weight profile */
+        printf("\n:::::::::::\n");
+        printf("Rot #%02d [%4d-bit blocks] read from file '%s':\n",rotCnt,bitsPerBlock,rfName);
+        for (i=0;i<rotsPerCycle;i++)
+            printf("%4d%s",rotates[i],((i+1)%(wordsPerBlock/2))?"":"\n");
+        Show_HW_rounds(rotates);    /* show HW results for different numbers of rounds */
+        printf(":::::::::::\n");
+        }
+    rotCnt++;
+    return rotates;
+    }
+
+/* generate a randomly chosen set of rotation constants of given minimum hamming weight (using OR) */
+/* (this may take a while, depending on minHW,rounds) */
+/* */
+/* On entry r->rWorst is cleared and r->parentCRC is set to ~0u. */
+/* Returns 1 when r->rotList has been filled with a new rotation set (r->hw_OR, r->CRC and r->ID */
+/* are then set as well), or 0 when the rotation file ran out on this call (rotFileName is then */
+/* cleared, so later calls generate random sets). */
+/* */
+/* The function keeps static state between calls: the last candidate set (rotates), its Hamming */
+/* weight (hwBase), the next odd scale factor to try (rScale) and the next ID (rID). */
+uint get_rotation(rSearchRec *r,testParms t)
+ {
+ static u64b rCnt = 1; /* candidate counter: only incremented in this function */
+ static u64b rCntOK = 0; /* count of candidates that passed the first HW filter: only incremented here */
+ static uint rScale = BITS_PER_WORD; /* >= BITS_PER_WORD means "no scaling pending" */
+ static uint hwBase = 0; /* 0 means "pick a fresh random set" */
+ static uint rID = 1;
+ uint i,j,k,m,n,b,hw,q,qMask;
+ static u08b rotates[MAX_ROTS_PER_CYCLE]; /* last generated rotation set */
+ u08b goodRots[BITS_PER_WORD];
+ uint goodRotCnt;
+
+ r->rWorst = 0;
+ r->parentCRC = ~0u;
+
+ if (rotFileName) /* get from search results file? */
+ {
+ const u08b *rf = get_rotation_file(rotFileName);
+ if (rf)
+ {
+ for (i=0;i<rotsPerCycle;i++)
+ r->rotList[i] = rf[i];
+ Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+ r->ID = rID++;
+ return 1;
+ }
+ /* here with file exhausted. Keep going with randomized values */
+ rotFileName = NULL; /* don't use file any more */
+ return 0;
+ }
+ /* build the list of rotation counts that RotCnt_Bad does not reject */
+ for (i=goodRotCnt=0;i<BITS_PER_WORD;i++)
+ if (!RotCnt_Bad(i))
+ {
+ goodRots[goodRotCnt++] = (u08b) i;
+ }
+
+ qMask = ((wordsPerBlock/2)-1) & t.dupRotMask; /* filter for dup rotate counts in the same round? */
+ for (;;rCnt++)
+ {
+ if (hwBase == 0)
+ { /* pick a rotation set at random */
+ for (i=0;i<rotsPerCycle;)
+ {
+ rotates[i] = goodRots[Rand32() % goodRotCnt];
+ /* filter out unapproved rotation sets here */
+ for (q=i & ~qMask;q < i;q++) /* check for dups in the same round */
+ if (rotates[i] == rotates[q])
+ break;
+ if (q >= i) /* no dup, value ok, so this value is ok */
+ i++;
+ }
+ /* first filter uses the relaxed threshold (minHW_or - minOffs) */
+ hw = Cycle_Min_HW(t.rounds,rotates,t.minHW_or-t.minOffs,t.rotVerMask);
+ if (hw == 0) /* did we get close? */
+ continue;
+ rCntOK++;
+
+ hwBase = hw;
+ /* only a set that meets the full threshold and passes the saturation check gets scaled below */
+ if (hw >= t.minHW_or)
+ if (Cycle_Min_HW(t.maxSatRnds, rotates,0,t.rotVerMask) == BITS_PER_WORD)
+ {
+ for (i=0;i<rotsPerCycle;i++)
+ r->rotList[i] = rotates[i];
+ rScale = 1; /* set up for scaling below */
+ }
+ }
+ /* use odd scaling for randomly generated rotations */
+ /* (rScale is static, so successive calls return successive scaled variants of the same base set) */
+ for (;rScale < BITS_PER_WORD;)
+ {
+ for (i=0;i<rotsPerCycle;i++)
+ {
+ r->rotList[i] = (rotates[i] * rScale) % BITS_PER_WORD;
+ if (RotCnt_Bad(r->rotList[i]))
+ break;
+ }
+ rScale+=2; /* bump scale factor for next time */
+ if (i >= rotsPerCycle)
+ { /* all values ok: this one's a keeper */
+ Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+ r->ID = rID++;
+ return 1;
+ }
+ }
+ /* Try nearby values to see if hw gets better: monotonic hill climb. */
+ /* -- exhaustively try all possible values of pairs of changes */
+ for (m=0;m<rotsPerCycle;m++)
+ for (b=0;b<BITS_PER_WORD ;b++)
+ {
+ /* k is recomputed on every pass because it is reused further down as the HW threshold */
+ k = rotsPerCycle-1-m; /* work backwards, since we're already close */
+ rotates[k]++;
+ rotates[k] &= (BITS_PER_WORD-1);
+ if (RotCnt_Bad(rotates[k]))
+ continue;
+ for (q=k | qMask;q > k;q--) /* check for dups in the same round */
+ if (rotates[k] == rotates[q])
+ break;
+ if (q > k)
+ continue;
+ for (i=m+1;i<rotsPerCycle;i++)
+ {
+ n = rotsPerCycle-1-i; /* work backwards */
+ for (j=0;j<BITS_PER_WORD;j++)
+ {
+ rotates[n]++; /* try another rotation value */
+ rotates[n] &= (BITS_PER_WORD-1);
+ if (RotCnt_Bad(rotates[n]))
+ continue;
+ for (q=n | qMask;q > n;q--) /* check for dups in the same round */
+ if (rotates[n] == rotates[q])
+ break;
+ if (q > n)
+ continue;
+ /* from here on k holds the HW threshold: the larger of minHW_or and hwBase */
+ k = (t.minHW_or > hwBase) ? t.minHW_or : hwBase;
+ hw = Cycle_Min_HW(t.rounds,rotates,k,t.rotVerMask);
+ if (hw > hwBase)
+ if (Cycle_Min_HW(t.maxSatRnds, rotates,0,t.rotVerMask) == BITS_PER_WORD)
+ { /* must improve hw to accept this new rotation set */
+ assert(hw >= t.minHW_or);
+ hwBase = hw;
+ rScale = 3; /* set up for scaling next time */
+ /* reusing the outer loop index i is harmless here: the function returns right after */
+ for (i=0;i<rotsPerCycle;i++)
+ r->rotList[i] = rotates[i];
+ Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+ r->ID = rID++;
+ return 1;
+ }
+ }
+ }
+ }
+ hwBase = 0; /* back to random */
+ }
+ }
+
+/* Write one search record to f.                                            */
+/* First comes a one-line "rMin = ..." summary: score, CRC, parent CRC,     */
+/* ID, minimum hw_OR over all versions, sample count, block size and a      */
+/* mode tag.  The rotation list follows, laid out according to showMode:    */
+/*    SHOW_NONE    --> no rotation list                                     */
+/*    SHOW_ROTS_H  --> enum-style lines suitable for "skein.h"              */
+/*    otherwise    --> plain rows of wordsPerBlock/2 values                 */
+void ShowSearchRec(FILE *f,const rSearchRec *r,testParms t,uint showMode,char markCh,uint showNum)
+    {
+    uint row,col,idx,ver;
+    uint lowHW = BITS_PER_WORD;
+    const char *tag = "";
+    char finalTag[200];
+
+    /* smallest hw_OR over all versions */
+    for (ver=0; ver<MAX_ROT_VER_CNT; ver++)
+        if ((uint) r->hw_OR[ver] < lowHW)
+            lowHW = (uint) r->hw_OR[ver];
+
+    /* mode tag shown at the end of the summary line */
+    if (showMode == SHOW_ROTS_FINAL)
+        {
+        sprintf(finalTag,".final:%02d " ,showNum);
+        tag = finalTag;
+        }
+    else if (showMode == SHOW_ROTS_H)
+        tag = ".format";
+    else if (showMode == SHOW_ROTS_PRELIM)
+        tag = ".prelim";
+
+    fprintf(f,"rMin = %5.3f.%c [CRC=%08X. parent=%08X. ID=%08X. hw_OR=%2d. cnt=%5d. bits=%4u]%-10s%s%s\n",
+            r->rWorst/(double)t.sampleCnt,markCh,r->CRC,r->parentCRC,r->ID,
+            lowHW,t.sampleCnt,bitsPerBlock,tag,
+            (t.tstFlags & TST_FLG_USE_ABS)?" useAbs":"",(r->ID & ID_RECALC_BIT)?" recalc":""
+           );
+
+    if (showMode == SHOW_NONE)
+        return;
+
+    if (showMode == SHOW_ROTS_H)
+        {                               /* format for "skein.h" */
+        idx = 0;
+        for (row=0; row<rotsPerCycle/(wordsPerBlock/2); row++)
+            {
+            fprintf(f," ");
+            for (col=0; col<wordsPerBlock/2; col++)
+                {
+                fprintf(f,(wordsPerBlock == 16)?" R%04d":" R_%03d",wordsPerBlock*64);
+                fprintf(f,"_%d_%d=%2d,",row,col,r->rotList[idx++]);
+                }
+            fprintf(f,"\n");
+            }
+        return;
+        }
+
+    /* every other mode: plain table */
+    for (idx=0; idx<rotsPerCycle; idx++)
+        fprintf(f," %2d%s",r->rotList[idx],((idx+1)%(wordsPerBlock/2))?"":"\n");
+    }
+
+/* Compute Skein differentials for a given rotation set.
+ *
+ * For every rotation version enabled in t.rotVerMask and every odd
+ * difference pattern d < 2**t.diffBits, t.sampleCnt random blocks are
+ * mixed.  For each input bit position i the difference d is injected, and
+ * the function counts how often each output bit flips.
+ *
+ * The score of one (version,d) pass is the smallest such count (rMin).
+ * With TST_FLG_USE_ABS the score is the smaller of rMin and
+ * sampleCnt - rMax, i.e. the largest deviation from one half.
+ *
+ * r->rWorst receives the lowest score over all passes and is returned.
+ * A return of 0 means the set was rejected, either by the quick exit
+ * after 32 samples or because some pass scored 0.
+ *
+ * With TST_FLG_SHOW, per-pass statistics are printed to stdout.
+ * With TST_FLG_DO_RAND, random data replaces the mixing of the modified
+ * block, and only one pass is run.
+ */
+uint CheckDifferentials(rSearchRec *r,testParms t)
+    {
+    enum  { HIST_BINS = 20 };
+
+    uint    i,j,k,v,n,d,dMax,minCnt,maxCnt,vCnt,q;
+    uint    rMin,rMax,hwMin,hwMax,hw,rMinCnt,rMaxCnt,iMin,jMin,iMax,jMax;
+    uint    hist[HIST_BINS+1];
+    u08b    rots[MAX_ROTS_PER_CYCLE];
+    u64b    totSum,w,y,z,oMask;
+    double  fSum,fSqr,x,var,denom;
+    static  u64b onesCnt[3][MAX_BITS_PER_BLK][MAX_BITS_PER_BLK/8]; /* pack eight 8-bit counts into each u64b (for speed) */
+    u64b   *oPtr;
+    struct
+        {
+        Block pt,ct;
+        } a,b;
+
+    r->rWorst = t.sampleCnt;
+    dMax = 1u << (t.diffBits & (BITS_PER_WORD-1));
+    iMin = jMin = iMax = jMax = bitsPerBlock + 1;
+
+    for (v=vCnt=0;v < MAX_ROT_VER_CNT; v++)
+        { /* different versions of rotation schedule, including "inverse" cipher */
+        if ((t.rotVerMask & (1 << v)) == 0)
+            continue;
+        vCnt++;                 /* number of versions processed */
+        if (v & 1)
+            { /* do it on the "half-cycle" */
+            for (i=0;i<rotsPerCycle;i++)
+                {
+                rots[i] = r->rotList[(i >= rotsPerCycle/2) ? i - rotsPerCycle/2 : i + rotsPerCycle/2];
+                }
+            }
+        else
+            memcpy(rots,r->rotList,rotsPerCycle*sizeof(rots[0]));
+        for (d=1; d < dMax; d+=2)   /* multi-bit difference patterns (must start with a '1' bit) */
+            {
+            hwMax=0;
+            hwMin=bitsPerBlock+1;
+            memset(onesCnt,0,sizeof(onesCnt));  /* clear stats before starting */
+
+            oMask = DUP_64(0x01010101);         /* mask for adding, 8 bins at a time */
+            for (n=1;n<=t.sampleCnt;n++)
+                {
+                for (i=0;i<wordsPerBlock;i++)   /* generate input blocks in a portable way */
+                    a.pt.x[i] = Rand64();
+                a.ct = a.pt;
+                if (v & 2)
+                    rev_cycle(a.ct.x,rots,t.rounds);
+                else
+                    fwd_cycle(a.ct.x,rots,t.rounds);
+                for (i=0;i<bitsPerBlock;i++)
+                    {
+                    b.pt = a.pt;
+                    b.pt.x[i/BITS_PER_WORD] ^= left_rot((u64b)d,(i%BITS_PER_WORD)); /* inject input difference */
+                    b.ct = b.pt;
+                    if (t.tstFlags & TST_FLG_DO_RAND)
+                        RandBytes(b.ct.x,sizeof(b.ct.x));   /* random results as a comparison point */
+                    else if (v & 2)
+                        rev_cycle(b.ct.x,rots,t.rounds);    /* let Skein do the mixing */
+                    else
+                        fwd_cycle(b.ct.x,rots,t.rounds);    /* let Skein do the mixing */
+                    z = 0;                      /* accumulate total hamming weight in z */
+                    oPtr = onesCnt[0][i];
+                    for (j=0;j<wordsPerBlock;j++)
+                        { /* inner-most loop: unroll it fully */
+                        w = b.ct.x[j] ^ a.ct.x[j];          /* xor difference in each ciphertext word */
+                        y = (w     ) & oMask; oPtr[0] += y; z += y; /* sum 8 bins at a time (bits 0,8,16,24...,56) */
+                        y = (w >> 1) & oMask; oPtr[1] += y; z += y;
+                        y = (w >> 2) & oMask; oPtr[2] += y; z += y; /* do it 8 times to cover all bits in w */
+                        y = (w >> 3) & oMask; oPtr[3] += y; z += y;
+
+                        y = (w >> 4) & oMask; oPtr[4] += y; z += y;
+                        y = (w >> 5) & oMask; oPtr[5] += y; z += y;
+                        y = (w >> 6) & oMask; oPtr[6] += y; z += y;
+                        y = (w >> 7) & oMask; oPtr[7] += y; z += y;
+                        oPtr += 8;
+                        }
+                    /* sum up the total hamming weight bins (very carefully) */
+                    z = (z & DUP_64(0x00FF00FF)) + ((z >> 8) & DUP_64(0x00FF00FF));
+                    hw = (uint) (z + (z >> 16) + (z >> 32) + (z >> 48)) & 0xFFFF;
+                    if (hwMin > hw) hwMin = hw; /* update total hw min/max stats */
+                    if (hwMax < hw) hwMax = hw;
+                    }
+                if ((n & 0x7F) == 0)
+                    { /* prevent onesCnt[0] overflow by "transferring" MSBs of 8-bit bytes into onesCnt[1] */
+                    for (i=0;i<bitsPerBlock  ;i++)
+                        for (j=0;j<bitsPerBlock/8;j++)
+                            { /* add the MSB (bit 7) of each byte into onesCnt[1], then mask it off in onesCnt[0] */
+                            onesCnt[1][i][j] += (onesCnt[0][i][j] >> 7) & oMask;
+                            onesCnt[0][i][j] &= ~(oMask << 7);
+                            }
+                    if ((n & 0x3FFF) == 0)
+                        { /* propagate overflow into onesCnt[2] (occasionally, as needed) */
+                        for (i=0;i<bitsPerBlock  ;i++)
+                            for (j=0;j<bitsPerBlock/8;j++)
+                                {
+                                onesCnt[2][i][j] += (onesCnt[1][i][j] >> 7) & oMask;
+                                onesCnt[1][i][j] &= ~(oMask << 7);
+                                }
+                        }
+                    }
+                if (n == 32 && d == 1 && (t.tstFlags & TST_FLG_QUICK_EXIT))
+                    { /* quick exit if not even close to random looking after a few samples */
+                    for (i=0;i<bitsPerBlock  ;i++)
+                        for (j=0;j<bitsPerBlock/8;j++)
+                            {
+                            /* NOTE(review): this tests the packed word as a whole, so it is  */
+                            /* true only when every one of its eight 8-bit counts is 0 or 1.  */
+                            if ((onesCnt[0][i][j] & ~oMask) == 0)   /* any count less than 2? */
+                                {
+                                /** Since an ideal random function has prob=0.5 each for input/output bit
+                                 ** pair, the expected distribution of onesCnt[i][j] is binomial.
+                                 ** Thus, at this point (32 samples), the probability of a single
+                                 ** count being less than 2 is:
+                                 **        (1+32)/(2**32)
+                                 ** This probability is roughly 2**(-27), so when we observe such an
+                                 ** occurrence, we exit immediately to save taking a lot of stats just
+                                 ** to fail later. This filter significantly speeds up the search, at a
+                                 ** very low probability of improperly dismissing a "good" rotation set.
+                                 **/
+                                if (t.tstFlags & TST_FLG_SHOW && vCnt > 1)
+                                    { /* show why we stopped, if we already showed something */
+                                    printf("%23s/* quick exit: %d/%d */\n","",(uint)onesCnt[0][i][j],n);
+                                    }
+                                return r->rWorst = 0;   /* not a good result */
+                                }
+                            }
+                    }
+                }
+            /* now process the stats from the samples we just generated */
+            assert(t.sampleCnt < (1 << 22));    /* 2**22 is big enough not to worry! */
+            memset(hist,0,sizeof(hist));
+            fSum = fSqr = 0.0;
+            denom = 1.0 / (double) t.sampleCnt;
+            rMin = minCnt = ~0u;
+            totSum= rMax = rMinCnt = rMaxCnt = maxCnt = 0;
+            for (i=0;i<bitsPerBlock;i++)
+                {
+                for (j=0;j<bitsPerBlock/8;j++)
+                    {
+                    w = onesCnt[0][i][j];       /* 7+ bits here */
+                    y = onesCnt[1][i][j];       /* 7+ bits here */
+                    z = onesCnt[2][i][j];       /* 8 bits here. Total = 22 bits */
+                    for (k=0;k<8;k++,w >>= 8,y >>= 8,z >>= 8)
+                        {
+                        q = (uint) ((w & 0xFF) + ((y & 0xFF) << 7) + ((z & 0xFF) << 14));
+                        if (maxCnt < q) { maxCnt = q; iMax = i; jMax = j; if (rMax < q) { rMax = q; rMaxCnt = 0; } }
+                        if (minCnt > q) { minCnt = q; iMin = i; jMin = j; if (rMin > q) { rMin = q; rMinCnt = 0; } }
+                        if (rMin == minCnt) rMinCnt++;
+                        if (rMax == maxCnt) rMaxCnt++;
+                        if (t.tstFlags & TST_FLG_SHOW)
+                            { /* compute more extensive stats only if showing results below */
+                            totSum += q;
+                            x = q*denom;        /* update stats for stdDev */
+                            fSum += x;
+                            fSqr += x*x;
+                            hist[(uint)floor(x*HIST_BINS)]++;   /* track histogram */
+                            }
+                        }
+                    }
+                }
+            if (t.tstFlags & TST_FLG_USE_ABS && rMin > t.sampleCnt - rMax)
+                {
+                rMin = t.sampleCnt - rMax;      /* use max variation from 1/2 */
+                iMin = iMax;
+                jMin = jMax;
+                }
+            if (r->rWorst > rMin)
+                {
+                r->rWorst = rMin;
+                if (rMin == 0)
+                    { /* if far worse than current best, stop now (to speed up the search) */
+                    if (t.tstFlags & TST_FLG_SHOW && (d > 1 || vCnt > 1))  /* show why we stopped, if we already showed something */
+                        printf("%23s/* early exit */\n","");
+                    return r->rWorst = 0;
+                    }
+                }
+            if (t.tstFlags & TST_FLG_SHOW)
+                { /* show some detailed results of the test */
+                if (d == 1)
+                    { /* put out the rotation info the first time thru */
+                    if ((t.tstFlags & TST_FLG_DO_RAND) == 0)
+                        {
+                        printf("Rotation set [CRC=%08X. hw_OR=%2d. sampleCnt=%5d. block=%4d bits. v=%d]:\n",
+                               r->CRC,r->hw_OR[v],t.sampleCnt,bitsPerBlock,v);
+                        /* vCnt has already been incremented for this version, so the  */
+                        /* first version processed has vCnt == 1.  The former test     */
+                        /* (vCnt == 0) could never be true, so the list never printed. */
+                        if (vCnt == 1)
+                            for (i=0;i<rotsPerCycle;i++)
+                                printf(" %2d%s",r->rotList[i],((i+1)%(wordsPerBlock/2))?"":"\n");
+                        }
+                    }
+                printf("rnds=%2d,cnt=%5d",t.rounds,t.sampleCnt);
+                x  =  fSum/(bitsPerBlock*bitsPerBlock);
+                var= (fSqr/(bitsPerBlock*bitsPerBlock)) - x*x;
+                printf(" min=%5.3f.[%c] max=%5.3f.[%c] hw=%3d..%3d. avg=%7.5f. std=%6.4f. d=%X. [%3d,%3d]",
+                       rMin*denom,(rMinCnt > 9) ? '+' : '0'+rMinCnt,
+                       rMax*denom,(rMaxCnt > 9) ? '+' : '0'+rMaxCnt,
+                       hwMin,hwMax,
+                       (totSum*denom)/(bitsPerBlock*bitsPerBlock),sqrt(var),(uint)d,iMin,jMin);
+                if (t.tstFlags & TST_FLG_SHOW_HIST)
+                    { /* very wide histogram display */
+                    for (i=0;i<=HIST_BINS;i++)
+                        if (hist[i])
+                            printf(" %7.5f",hist[i]/(double)(bitsPerBlock*bitsPerBlock));
+                        else
+                            printf(" _ ");
+                    }
+                if (t.tstFlags & TST_FLG_DO_RAND)
+                    printf(" [RANDOM] ");
+                printf("\n");
+                fflush(stdout);
+                }
+            if (t.tstFlags & TST_FLG_DO_RAND)
+                break;  /* no need to do more than one random setting per rotation set */
+            }   /* for (d=1;d<dMax;d+=2) */
+        if (t.tstFlags & TST_FLG_DO_RAND)
+            break;      /* no need to do more than one random setting per rotation set */
+        }
+    return r->rWorst;
+    }
+
+/* twiddle a bit with an entry, but keep maxSatRounds satisfied */
+/* */
+/* Replaces between 1 and MAX_ROT_CNT randomly chosen entries of r->rotList with new random */
+/* rotation counts, retrying until the result passes the saturation check (and, on the first */
+/* two of the four attempts when TST_FLG_KEEP_MIN_HW is set, also the minHW_or check). */
+/* On success r->hw_OR and r->CRC are refreshed via Set_Min_hw_OR. If every attempt fails, */
+/* the record is refilled by get_rotation instead. */
+/* r->ID and r->parentCRC are updated up front in either case. */
+void Twiddle(rSearchRec *r,testParms t)
+ {
+ enum { MAX_TWIDDLE_CNT = 100, MAX_ROT_CNT = 6 };
+ uint i,j,k,n,v[MAX_ROT_CNT];
+ u08b old[MAX_ROT_CNT];
+ u64b usedBitmap; /* one bit per rotation index already picked */
+ u08b goodRots[BITS_PER_WORD];
+ uint goodRotCnt;
+
+ assert(rotsPerCycle <= sizeof(usedBitmap)*8);
+ r->ID += (1 << TWIDDLE_CNT_BIT0); /* bump count of number of times twiddled */
+ r->ID &= ~ID_RECALC_BIT; /* show this one has not had a recalc yet */
+ r->parentCRC = r->CRC; /* track genealogy */
+
+ /* build the list of rotation counts that RotCnt_Bad does not reject */
+ for (i=goodRotCnt=0;i<BITS_PER_WORD;i++)
+ if (!RotCnt_Bad(i))
+ {
+ goodRots[goodRotCnt++] = (u08b) i;
+ }
+
+ n = 1 + (Rand08() % MAX_ROT_CNT); /* how many entries to change: 1..MAX_ROT_CNT */
+ for (i=0;i<4;i++)
+ {
+ usedBitmap = 0;
+ for (j=0;j<n;j++)
+ { /* pick which set of n rotation constants to change */
+ do {
+ v[j] = Rand08() % rotsPerCycle; /* rotation index */
+ }
+ while ((usedBitmap >> v[j]) & 1); /* make sure all v[j] values are unique */
+ usedBitmap |= (((u64b) 1) << v[j]);
+ old[j] = r->rotList[v[j]]; /* save current value */
+ }
+ for (k=0;k<MAX_TWIDDLE_CNT/4;k++)
+ { /* here with n rotation indices (v[0..n-1]) to be changed */
+ for (j=0;j<n;j++)
+ {
+ do {
+ r->rotList[v[j]] = goodRots[Rand32() % goodRotCnt];
+ } /* make sure new rotation value changes */
+ while (r->rotList[v[j]] == old[j]);
+ }
+ if (Cycle_Min_HW(t.maxSatRnds,r->rotList,0,t.rotVerMask) == BITS_PER_WORD)
+ {
+ /* the minHW_or requirement is dropped from the third attempt (i >= 2) onwards */
+ if (i >= 2 || !(t.tstFlags & TST_FLG_KEEP_MIN_HW) ||
+ Cycle_Min_HW(t.rounds,r->rotList,t.minHW_or,t.rotVerMask) >= (int) t.minHW_or)
+ {
+ Set_Min_hw_OR(r,t.rotVerMask,t.rounds);
+ return;
+ }
+ }
+ for (j=0;j<n;j++) /* didn't work: go back to the old values */
+ r->rotList[v[j]] = old[j];
+ }
+ }
+ /* twiddling failed to produce a valid set (very rare). Select a brand new one */
+ get_rotation(r,t);
+ }
+
+/*
+** Run a full search for the Skein block size selected by the global bitsPerBlock
+** (256, 512 or 1024; anything else prints a message and exits with status 2).
+**
+** t is received by value: the defaults filled in below and the temporary changes
+** to sampleCnt, tstFlags, rotVerMask and diffBits stay local to this call.
+**
+** Steps, in order:
+**   1. seed the PRNG from t.seed0 and bitsPerBlock, select the per-blocksize cycle
+**      functions and substitute defaults for parameters that are zero
+**   2. fill popList[] using get_rotation()
+**   3. for up to t.genCntMax generations (or until t.runHours has elapsed): grade
+**      entries with CheckDifferentials(), sort, then replicate/splice/Twiddle()
+**      the first keepCnt entries over the rest of the population
+**   4. re-grade the first keepCnt entries with larger sample counts and print them
+*/
+void RunSearch(testParms t)
+    {
+    enum { KEEP_DIV = 16, KEEP_REP = 10, SHOW_CNT = 8 };
+    rSearchRec popList[MAX_POP_CNT+2];
+    uint i,j,k,n,repCnt,genCnt,keepCnt,prevBest[SHOW_CNT],showMask;
+    const char *timeStr;
+    time_t t0,t1;
+
+    Rand_Init(t.seed0 + (((u64b) bitsPerBlock) << 32));
+    memset(prevBest,0,sizeof(prevBest));
+
+    /* now set up the globals according to selected Skein blocksize */
+    switch (bitsPerBlock)
+        {
+        case 256:
+            t.genCntMax = (t.genCntMax) ? t.genCntMax : DEFAULT_GEN_CNT_4 ;
+            t.rounds = (t.rounds) ? t.rounds : DEFAULT_ROUND_CNT_4;
+            t.minHW_or = (t.minHW_or) ? t.minHW_or : MIN_HW_OR_4;
+            t.maxSatRnds = (t.maxSatRnds)? t.maxSatRnds : MAX_SAT_ROUNDS_4;
+            fwd_cycle_or_rN = (t.rounds!=8) ? fwd_cycle_4_or : fwd_cycle_4_or_r8 ;
+            rev_cycle_or_rN = (t.rounds!=8) ? rev_cycle_4_or : rev_cycle_4_or_r8 ;
+            fwd_cycle_or = fwd_cycle_4_or;
+            rev_cycle_or = rev_cycle_4_or; /* was fwd_cycle_4_or: now matches the 512/1024 cases */
+            fwd_cycle = fwd_cycle_4;
+            rev_cycle = rev_cycle_4;
+            showMask = 7;
+            break;
+        case 512:
+            t.genCntMax = (t.genCntMax) ? t.genCntMax : DEFAULT_GEN_CNT_8 ;
+            t.rounds = (t.rounds) ? t.rounds : DEFAULT_ROUND_CNT_8;
+            t.minHW_or = (t.minHW_or) ? t.minHW_or : MIN_HW_OR_8;
+            t.maxSatRnds = (t.maxSatRnds)? t.maxSatRnds : MAX_SAT_ROUNDS_8;
+            fwd_cycle_or_rN = (t.rounds!=8) ? fwd_cycle_8_or : fwd_cycle_8_or_r8 ;
+            rev_cycle_or_rN = (t.rounds!=8) ? rev_cycle_8_or : rev_cycle_8_or_r8 ;
+            fwd_cycle_or = fwd_cycle_8_or;
+            rev_cycle_or = rev_cycle_8_or;
+            fwd_cycle = fwd_cycle_8;
+            rev_cycle = rev_cycle_8;
+            showMask = 3;
+            break;
+        case 1024:
+            t.genCntMax = (t.genCntMax) ? t.genCntMax : DEFAULT_GEN_CNT_16 ;
+            t.rounds = (t.rounds) ? t.rounds : DEFAULT_ROUND_CNT_16;
+            t.minHW_or = (t.minHW_or) ? t.minHW_or : MIN_HW_OR_16;
+            t.maxSatRnds = (t.maxSatRnds)? t.maxSatRnds : MAX_SAT_ROUNDS_16;
+            fwd_cycle_or_rN = (t.rounds!=9) ? fwd_cycle_16_or: fwd_cycle_16_or_r9 ;
+            rev_cycle_or_rN = (t.rounds!=9) ? rev_cycle_16_or: rev_cycle_16_or_r9 ;
+            fwd_cycle_or = fwd_cycle_16_or;
+            rev_cycle_or = rev_cycle_16_or;
+            fwd_cycle = fwd_cycle_16;
+            rev_cycle = rev_cycle_16;
+            showMask = 1;
+            break;
+        default:
+            printf("Invalid block size!");
+            exit(2);
+        }
+    /* clamp the population size to the supported range */
+    if (t.popCnt > MAX_POP_CNT)
+        t.popCnt = MAX_POP_CNT;
+    if (t.popCnt < MIN_POP_CNT)
+        t.popCnt = MIN_POP_CNT;
+    wordsPerBlock = bitsPerBlock / BITS_PER_WORD;
+    rotsPerCycle = (wordsPerBlock / 2) * ROUNDS_PER_CYCLE;
+
+    keepCnt = t.popCnt/KEEP_DIV;
+    assert(keepCnt*(1+KEEP_REP) <= t.popCnt);
+
+    printf("******************************************************************\n");
+    printf("Random seed = %u. BlockSize =%4d bits. sampleCnt =%6d. rounds = %2d. minHW_or=%d. CPU = %d-bit\n",
+           t.seed0,bitsPerBlock,t.sampleCnt,t.rounds,t.minHW_or,(uint)sizeof(size_t)*8);
+    printf("Population = %d. keepCnt = %d. repCnt = %d. rest = %d. keepMinHW = %d\n",
+           t.popCnt,keepCnt,KEEP_REP,t.popCnt-keepCnt*(1+KEEP_REP),(t.tstFlags & TST_FLG_KEEP_MIN_HW)?1:0);
+    timeStr = ASCII_TimeDate();
+    if (t.tstFlags & TST_FLG_STDERR)
+        {
+        fprintf(stderr,"Start: %sBlock size = %d bits. popCnt = %d. sampleCnt = %d. keepMinHW = %d",
+                timeStr,bitsPerBlock,t.popCnt,t.sampleCnt,(t.tstFlags & TST_FLG_KEEP_MIN_HW)?1:0);
+        if (t.runHours)
+            fprintf(stderr,". run time = %d hours",t.runHours);
+        fprintf(stderr,"\n");
+        }
+    else
+        showMask = 0; /* (i & 0) == 1 is never true, so the progress display below stays off */
+    printf("Start: %s \n",timeStr);
+    time(&t0);
+    fflush(stdout);
+
+    for (n=0;n<t.popCnt;n++)
+        { /* initialize the population with rotations that have "reasonable" hw_OR */
+        if (t.tstFlags & TST_FLG_STDERR)
+            fprintf(stderr,"\rGetRot: %04X \r",t.popCnt-n);
+        if (get_rotation(&popList[n],t) == 0)
+            t.popCnt = n; /* stop after end of file read in */
+        }
+    if (t.tstFlags & TST_FLG_STDERR)
+        fprintf(stderr,"\r%25s\r","");
+
+    for (genCnt=0;genCnt < t.genCntMax;genCnt++)
+        { /* advance to the next generation */
+        for (i=0;i<t.popCnt;i++)
+            { /* generate stats for all entries (this loop is where all the time is spent!) */
+            if ((i & showMask) == 1)
+                fprintf(stderr,"#%04X \r",t.popCnt-i);
+            if (genCnt == 0 || i >= keepCnt)
+                {
+                CheckDifferentials(&popList[i],t);
+                }
+            else if (i <= keepCnt/2 && (popList[i].ID & ID_RECALC_BIT) == 0)
+                { /* recalc with bigger sampleCnt for better accuracy */
+                t.sampleCnt <<= 2;
+                CheckDifferentials(&popList[i],t);
+                t.sampleCnt >>= 2;
+                popList[i].rWorst = (popList[i].rWorst + 2) / 4; /* rescale (rounded) to the normal sampleCnt */
+                popList[i].ID |= ID_RECALC_BIT;
+                }
+            }
+        qsort(popList,t.popCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+        if (t.genCntMax == 1)
+            { keepCnt = t.popCnt; break; } /* allow quick processing from file */
+        /* now update the population for the next generation */
+        n = t.popCnt-1; /* start discarding at the end of the list */
+        for (i=0;i<keepCnt;i++)
+            {
+            if (t.tstFlags & TST_FLG_WEIGHT_REP)
+                repCnt = (i < keepCnt/2) ? KEEP_REP+2 : KEEP_REP-2 ;
+            else
+                repCnt = KEEP_REP;
+            for (j=0;j<repCnt;j++,n--)
+                { /* replicate the best ones, replacing the worst ones */
+                popList[n] = popList[i];
+                if (j == 0)
+                    { /* splice two together, but only if they are from the same initial rotation set */
+                    k = Rand32() % keepCnt;
+                    if (((popList[n].ID ^ popList[k].ID) & ID_NUM_MASK) == 0)
+                        memcpy(popList[n].rotList,
+                               popList[k].rotList,
+                               rotsPerCycle*sizeof(popList[n].rotList[0])/2);
+                    }
+                Twiddle(&popList[n],t); /* tweak the replicate entry a bit */
+                assert(n >= keepCnt); /* sanity check */
+                }
+            }
+        for (;n>=keepCnt;n--) /* just tweak the rest */
+            {
+            Twiddle(&popList[n],t);
+            }
+        time(&t1);
+        /* show current best */
+        if (t.tstFlags & TST_FLG_STDERR)
+            { /* first to stderr (assuming redirected stdout) */
+            fprintf(stderr,"\r%4d: ",genCnt+1);
+            for (i=j=0;i<SHOW_CNT;i++)
+                {
+                fprintf(stderr," %5.3f%c",popList[i].rWorst/(double)t.sampleCnt,(popList[i].ID & ID_RECALC_BIT)?'r':' ');
+                j |= (popList[i].rWorst ^ prevBest[i]); /* track changes */
+                prevBest[i] = popList[i].rWorst;
+                }
+            fprintf(stderr," {%6d sec%c}\n",(uint)(t1-t0),(j) ? '*':' ');
+            }
+        if (t.tstFlags & TST_FLG_VERBOSE)
+            { /* then more details to stdout */
+            printf("::::: Gen =%5d. Best =%6.3f. PopCnt =%5d. SampleCnt =%5d. time=%6d.\n",
+                   genCnt+1,popList[0].rWorst/(double)t.sampleCnt,t.popCnt,t.sampleCnt,(uint)(t1-t0));
+            for (i=0;i<keepCnt;i++)
+                ShowSearchRec(stdout,&popList[i],t,SHOW_ROTS_PRELIM,(i)?' ':'-',i+1);
+            fflush(stdout);
+            }
+        if (t.runHours && t.runHours*3600 < (uint) (t1 - t0))
+            break; /* timeout? */
+        }
+
+    /* re-grade the top entries using larger sampleCnt values */
+    printf("\n+++++++++++++ Preliminary results: sampleCnt = %5d, block = %4d bits\n",t.sampleCnt,bitsPerBlock);
+    qsort(popList,keepCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+    for (i=0;i<keepCnt;i++)
+        ShowSearchRec(stdout,&popList[i],t,SHOW_ROTS_PRELIM,' ',i+1);
+
+    /* re-run several times, since there will be statistical variations */
+    t.rotVerMask = MAX_ROT_VER_MASK;
+    t.diffBits = (t.diffBits & 0x100) ? t.diffBits : 3;
+    t.sampleCnt *= 2;
+    t.tstFlags |= TST_FLG_SHOW;
+    t.tstFlags &= (TST_FLG_STDERR | TST_FLG_SHOW | TST_FLG_USE_ABS | TST_FLG_CHECK_ONE | TST_FLG_SHOW_HIST);
+
+    for (j=0;j < ((t.tstFlags & TST_FLG_CHECK_ONE) ? 1u:2u) ;j++)
+        { /* do it twice, once with and once without USE_ABS, unless TST_FLG_CHECK_ONE set */
+        if (!(t.tstFlags & TST_FLG_CHECK_ONE))
+            t.tstFlags ^= TST_FLG_USE_ABS;
+        for (n=0;n<t.regradeCnt;n++)
+            {
+            t.sampleCnt *= 2; /* undone after this loop by the ">>= n" below */
+            printf("+++ Re-running differentials with sampleCnt = %d, blockSize = %4d bits.%s\n",
+                   t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+            for (i=0;i<keepCnt;i++)
+                {
+                if (t.tstFlags & TST_FLG_STDERR)
+                    fprintf(stderr," Re-run: samples=%d, blk=%4d. #%02d.%s \r",
+                            t.sampleCnt,bitsPerBlock,keepCnt-i,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+                CheckDifferentials(&popList[i],t);
+                fflush(stdout);
+                }
+            if (keepCnt == 1)
+                { /* show random comparison for final values */
+                printf(" RANDOM OUTPUT: /* useful stats for comparison to 'ideal' */\n");
+                t.tstFlags |= TST_FLG_DO_RAND;
+                for (i=0;i<2;i++)
+                    { /* popList[keepCnt] is a scratch copy, so the kept entry is not disturbed */
+                    popList[keepCnt] = popList[keepCnt-1];
+                    CheckDifferentials(&popList[keepCnt],t);
+                    }
+                t.tstFlags &= ~TST_FLG_DO_RAND;
+                }
+            /* sort per new stats */
+            if (t.tstFlags & TST_FLG_STDERR)
+                fprintf(stderr,"\r%60s\r","");
+            printf("\n+++++++++++++ Final results: sampleCnt = %5d, blockSize = %4d bits.%s\n",
+                   t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+            qsort(popList,keepCnt,sizeof(popList[0]),Compare_SearchRec_Descending);
+            for (i=keepCnt;i;i--)
+                ShowSearchRec(stdout,&popList[i-1],t,SHOW_ROTS_FINAL,(i==1)?'-':' ',i);
+            fflush(stdout);
+            }
+        printf("\n+++++++++++++ Formatted results: sampleCnt = %5d, blockSize = %4d bits. %s\n",
+               t.sampleCnt,bitsPerBlock,(t.tstFlags & TST_FLG_USE_ABS)?" absDiff":"" );
+        for (i=keepCnt;i;i--)
+            {
+            ShowSearchRec(stdout,&popList[i-1],t,SHOW_ROTS_H,' ',i);
+            printf("\n");
+            Show_HW_rounds(popList[i-1].rotList);
+            printf("\n");
+            }
+        fflush(stdout);
+        t.sampleCnt >>= n; /* revert to original sampleCnt */
+        }
+
+    time(&t1);
+    printf("End: %s\n",ASCII_TimeDate());
+    printf("Elapsed time = %6.3f hours\n\n",(t1-t0)/(double)3600.0);
+    if (t.tstFlags & TST_FLG_STDERR)
+        fprintf(stderr,"\r%60s\n",""); /* clear the screen if needed */
+    fflush(stdout);
+    }
+
+void GiveHelp(void)
+    {   /* write the command-line usage summary to stdout, then exit with status 0 */
+    static const char usageText[] =
+        "Usage: skein_rot_search [options/flags]\n"
+        "Options: -Bnn = set Skein block size in bits (default=512)\n"
+        " -Cnn = set count of random differentials taken\n"
+        " -Dnn = set number bits of difference pattern tested (default=1)\n"
+        " -Gnn = set min invalid rotation value (default 0)\n"
+        " -Inn = set rotation version mask\n"
+        " -Onn = set Hamming weight offset\n"
+        " -Pnn = set population count\n"
+        " -Rnn = set round count\n"
+        " -Snn = set initial random seed (0 --> randomize)\n"
+        " -Tnn = set max time to run (in hours)\n"
+        " -Wnn = set minimum hamming weight\n"
+        " -Xnn = set max test rotation count\n"
+        " -Znn = set max rounds needed for saturation using OR\n"
+        " @file = read rotations from file\n"
+        "Flags: -A = use min, not absolute difference\n"
+        " -E = no stderr output\n"
+        " -H = show histogram (very wide)\n"
+        " -K = keep minHW_or during twiddling\n"
+        " -Q = disable quick exit in search\n"
+        " -U = weighted repeat count (repeat best more frequently)\n"
+        " -V = verbose mode\n";
+
+    /* the text holds no conversion specifiers, so it can be written verbatim */
+    fputs(usageText,stdout);
+    exit(0);
+    }
+
+/*
+** Program entry point: parse the command line into a testParms record and run
+** RunSearch() once per selected block size.
+**
+** Arguments starting with '-' or '+' are options/flags (the letter is case
+** insensitive, and a value may follow directly or after '='); "@file" names a
+** rotation file and limits the search to one generation; an argument starting
+** with '?' prints the help text.  If the global bitsPerBlock is zero after
+** parsing, the search is run for 256, 512 and 1024 bits in turn.
+** Always returns 0 (GiveHelp() and RunSearch() may call exit() themselves).
+*/
+int main(int argc,char *argv[])
+    {
+    uint i,bMin,bMax;
+    testParms t;
+    uint chkInv = 1; /* check inverse functions at startup (slow for debugging) */
+    uint goodRot= 2; /* first allowed rotation value (+/-) */
+    uint seed = 1; /* 0 = randomize based on time, else use specified seed */
+    uint do8 = 0; /* optimize 8-bit CPU performance */
+
+    t.rounds = 0; /* number of Skein rounds to test */
+    t.minHW_or = 0; /* minHW (using OR) required */
+    t.minOffs = 4; /* heuristic used to speed up rotation search */
+    t.diffBits = 1; /* # consecutive bits of differential inputs tested */
+    t.sampleCnt = 1024; /* number of differential pairs tested */
+    t.genCntMax = 0; /* number of "generations" tested */
+    t.maxSatRnds= 0; /* number of rounds to Hamming weight "saturation" */
+    t.rotVerMask= 3; /* mask of which versions to run */
+    t.runHours = 0; /* stop searching after this many hours */
+    t.dupRotMask= 0; /* default is to allow same rotation value in a round */
+    t.regradeCnt= 3; /* how many scaled up counts to try */
+    t.popCnt = DEFAULT_POP_CNT; /* size of population */
+    t.tstFlags = TST_FLG_STDERR | TST_FLG_VERBOSE | TST_FLG_USE_ABS | TST_FLG_CHECK_ONE; /* default flags */
+
+    for (i=1;i<(uint)argc;i++)
+        { /* parse command line args */
+        if (argv[i][0] == '?')
+            GiveHelp();
+        else if (argv[i][0] == '-' || argv[i][0] == '+')
+            {
+/* numeric value of option string s: skips the 2-character "-X" prefix and an optional '=' */
+#define arg_toi(s) atoi((s) + (((s)[2] == '=') ? 3 : 2))
+            /* toupper() is only defined for EOF and unsigned char values, so convert first */
+            switch (toupper((unsigned char) argv[i][1]))
+                {
+                case '?': GiveHelp(); break;
+
+                case 'A': t.tstFlags &= ~TST_FLG_USE_ABS; break;
+                case 'E': t.tstFlags &= ~TST_FLG_STDERR; break;
+                case 'H': t.tstFlags |= TST_FLG_SHOW_HIST; break;
+                case 'K': t.tstFlags |= TST_FLG_KEEP_MIN_HW; break;
+                case 'Q': t.tstFlags |= TST_FLG_QUICK_EXIT; break;
+                case 'U': t.tstFlags |= TST_FLG_WEIGHT_REP; break;
+                case 'V': t.tstFlags &= ~TST_FLG_VERBOSE; break;
+                case '1': t.tstFlags &= ~TST_FLG_CHECK_ONE; break;
+
+                case 'B': bitsPerBlock = arg_toi(argv[i]); break;
+                case 'C': t.sampleCnt = arg_toi(argv[i]); break;
+                case 'D': t.diffBits = arg_toi(argv[i]); break;
+                case 'G': goodRot = arg_toi(argv[i]); break;
+                case 'I': t.rotVerMask = arg_toi(argv[i]); break;
+                case 'J': t.regradeCnt = arg_toi(argv[i]); break;
+                case 'O': t.minOffs = arg_toi(argv[i]); break;
+                case 'P': t.popCnt = arg_toi(argv[i]); break;
+                case 'R': t.rounds = arg_toi(argv[i]); break;
+                case 'S': seed = arg_toi(argv[i]); break;
+                case 'T': t.runHours = arg_toi(argv[i]); break;
+                case 'W': t.minHW_or = arg_toi(argv[i]); break;
+                case 'X': t.genCntMax = arg_toi(argv[i]); break;
+                case 'Z': t.maxSatRnds = arg_toi(argv[i]); break;
+                case '2': t.dupRotMask = ~0u; break;
+                case '0': chkInv = 0; break;
+                case '8': do8 = 1; break;
+
+                default : printf("Unknown option: %s\n",argv[i]); GiveHelp(); break;
+                }
+            }
+        else if (argv[i][0] == '@')
+            {
+            rotFileName = argv[i]+1;
+            t.genCntMax = 1; /* stop after one generation */
+            }
+        }
+
+    if (chkInv)
+        InverseChecks(); /* check fwd vs. rev transforms (slow in debugger) */
+
+    /* one mask bit per allowed rotation count, goodRot .. BITS_PER_WORD-goodRot.
+       The "i < BITS_PER_WORD" bound keeps the shift count below the width of u64b
+       (with -G0 the upper limit would otherwise reach BITS_PER_WORD itself). */
+    t.goodRotCntMask = 0;
+    for (i=goodRot; i < BITS_PER_WORD && i <= BITS_PER_WORD - goodRot ;i++)
+        t.goodRotCntMask |= (((u64b) 1) << i);
+    if (do8)
+        t.goodRotCntMask = (((u64b) 0x03838383) << 32) | 0x83838380;
+
+    if (bitsPerBlock == 0)
+        {
+        printf("Running search for all Skein block sizes (256, 512, and 1024)\n");
+        t.rounds = 0; /* use defaults, since otherwise it makes little sense */
+        t.minHW_or = 0;
+        }
+
+    bMin = (bitsPerBlock) ? bitsPerBlock : 256;
+    bMax = (bitsPerBlock) ? bitsPerBlock : 1024;
+
+    for (bitsPerBlock=bMin;bitsPerBlock<=bMax;bitsPerBlock*=2)
+        {
+        t.seed0 = (seed) ? seed : (uint) time(NULL); /* randomize based on time if -s0 is given */
+        RunSearch(t);
+        }
+
+    return 0;
+    }
diff --git a/Additional_Implementations/skein_test.c b/Additional_Implementations/skein_test.c
new file mode 100644
index 000000000000..9d999e0d49c0
--- /dev/null
+++ b/Additional_Implementations/skein_test.c
@@ -0,0 +1,1380 @@
+/***********************************************************************
+**
+** Test/verification code for the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Testing:
+** - buffering of incremental calls (random cnt steps)
+** - partial input byte handling
+** - output sample hash results (for comparison of ref vs. optimized)
+** - performance
+**
+***********************************************************************/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <time.h>
+#include <assert.h>
+
+#include "skein.h"
+#include "SHA3api_ref.h"
+
+static const uint_t HASH_BITS[] = /* list of hash lengths to test */
+ { 160,224,256,384,512,1024, 256+8,512+8,1024+8,2048+8 };
+
+#define HASH_BITS_CNT (sizeof(HASH_BITS)/sizeof(HASH_BITS[0]))
+
+/* bits of the verbose flag word */
+#define V_KAT_LONG (1u << 0)
+#define V_KAT_SHORT (1u << 1)
+#define V_KAT_NO_TREE (1u << 2)
+#define V_KAT_NO_SEQ (1u << 3)
+#define V_KAT_NO_3FISH (1u << 4)
+#define V_KAT_DO_3FISH (1u << 5)
+
+/* automatic compiler version number detection */
+#if !defined(CompilerVersion)
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#define CompilerVersion (900)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
+#define CompilerVersion (600)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1000)
+#define CompilerVersion (420)
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define CompilerVersion (100*__GNUC__ + 10*__GNUC_MINOR__ + __GNUC_PATCHLEVEL__)
+#elif defined(__BORLANDC__) /* this is in hex */
+#define CompilerVersion (100*(__BORLANDC__ >> 8) + 10*((__BORLANDC__ >> 4) & 0xF) + (__BORLANDC__ & 0xF))
+#endif
+
+#endif
+
+/*
+** Unroll-count / code-size query functions, selected at compile time:
+**   - SKEIN_CODE_SIZE or SKEIN_PERF defined: only declared here, defined elsewhere
+**   - else SKEIN_LOOP defined: the unroll counts are the decimal digits of SKEIN_LOOP
+**     (hundreds digit for 256, tens digit for 512, ones digit for 1024)
+**   - otherwise: every unroll count is reported as 0
+*/
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* external functions to determine code size (in bytes) */
+size_t Skein_256_Process_Block_CodeSize(void);
+size_t Skein_512_Process_Block_CodeSize(void);
+size_t Skein1024_Process_Block_CodeSize(void);
+size_t Skein_256_API_CodeSize(void);
+size_t Skein_512_API_CodeSize(void);
+size_t Skein1024_API_CodeSize(void);
+uint_t Skein_256_Unroll_Cnt(void);
+uint_t Skein_512_Unroll_Cnt(void);
+uint_t Skein1024_Unroll_Cnt(void);
+#elif defined(SKEIN_LOOP)
+uint_t Skein_256_Unroll_Cnt(void) { return (SKEIN_LOOP / 100) % 10; }
+uint_t Skein_512_Unroll_Cnt(void) { return (SKEIN_LOOP / 10) % 10; }
+uint_t Skein1024_Unroll_Cnt(void) { return (SKEIN_LOOP ) % 10; }
+#else
+uint_t Skein_256_Unroll_Cnt(void) { return 0; }
+uint_t Skein_512_Unroll_Cnt(void) { return 0; }
+uint_t Skein1024_Unroll_Cnt(void) { return 0; }
+#endif
+
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/********************** debug i/o helper routines **********************/
+void FatalError(const char *s,...)
+    {   /* printf-style message plus a newline on stdout, then terminate with status 2 */
+    va_list args;
+
+    va_start(args,s);
+    vprintf(s,args);
+    va_end(args);
+
+    putchar('\n');
+    exit(2);
+    }
+
+static uint_t _quiet_ = 0; /* quiet processing? */
+static uint_t verbose = 0; /* verbose flag bits */
+static uint_t katHash = ~0u; /* use as a quick check on KAT results */
+
+void ShowBytes(uint_t cnt,const u08b_t *b)
+    {   /* hex dump of b[0..cnt-1], 16 bytes per line; every byte is also folded into katHash */
+    uint_t pos = 0;
+
+    while (pos < cnt)
+        {
+        const u08b_t cur = b[pos];
+        uint_t mixed;
+
+        /* separator before the byte: start of a line, or start of a group of four */
+        if ((pos % 16) == 0)
+            printf(" ");
+        else if ((pos % 4) == 0)
+            printf(" ");
+        printf(" %02X",cur);
+
+        /* running checksum over everything displayed */
+        mixed   = (katHash ^ cur) * 0xDEADBEEF;
+        katHash = (mixed ^ (mixed >> 23) ^ (mixed >> 17) ^ (mixed >> 9)) * 0xCAFEF00D;
+
+        /* end the line after 16 bytes, and after the very last byte */
+        if ((pos % 16) == 15 || pos == cnt-1)
+            printf("\n");
+        pos++;
+        }
+    }
+
+#ifndef SKEIN_DEBUG
+uint_t skein_DebugFlag = 0; /* dummy flags (if not defined elsewhere) */
+#endif
+
+#define SKEIN_DEBUG_SHORT (SKEIN_DEBUG_HDR | SKEIN_DEBUG_STATE | SKEIN_DEBUG_TWEAK | SKEIN_DEBUG_KEY | SKEIN_DEBUG_INPUT_08 | SKEIN_DEBUG_FINAL)
+#define SKEIN_DEBUG_DEFAULT (SKEIN_DEBUG_SHORT)
+
+void Show_Debug(const char *s,...)
+    {   /* printf-style output to stdout, produced only while skein_DebugFlag is nonzero */
+    va_list args;
+
+    if (!skein_DebugFlag)       /* debug display is off: say nothing */
+        return;
+
+    va_start(args,s);
+    vprintf(s,args);
+    va_end(args);
+    }
+
+/************** Timing routine (for performance measurements) ***********/
+/* unfortunately, this is generally assembly code and not very portable */
+
+#if defined(_M_IX86) || defined(__i386) || defined(_i386) || defined(__i386__) || defined(i386) || \
+ defined(_X86_) || defined(__x86_64__) || defined(_M_X64) || defined(__x86_64)
+#define _Is_X86_ 1
+#endif
+
+#if defined(_Is_X86_) && (!defined(__STRICT_ANSI__)) && (defined(__GNUC__) || !defined(__STDC__)) && \
+ (defined(__BORLANDC__) || defined(_MSC_VER) || defined(__MINGW_H) || defined(__GNUC__))
+#define HI_RES_CLK_OK 1 /* it's ok to use RDTSC opcode */
+
+#if defined(_MSC_VER) && defined(_M_X64)
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+#endif
+
+#endif
+
+/*
+** Return a 32-bit high-resolution time stamp.
+** When HI_RES_CLK_OK is defined, the value is x[0] as filled in by the RDTSC
+** instruction through compiler-specific code (EAX in the asm variants, the
+** truncated __rdtsc() result for 64-bit MSVC).  Otherwise FatalError() is called
+** (except where that call is compiled out for MSVC) and 0 is returned.
+** Side effect at compile time: defines COMPILER_ID for the compiler branch taken.
+*/
+uint_32t HiResTime(void)
+ {
+#if defined(HI_RES_CLK_OK)
+ uint_32t x[2];
+#if defined(__BORLANDC__)
+#define COMPILER_ID "BCC"
+ _asm { push edx };
+ __emit__(0x0F,0x31); /* RDTSC instruction */
+ _asm { pop edx };
+ _asm { mov x[0],eax };
+#elif defined(_MSC_VER)
+#define COMPILER_ID "MSC"
+#if defined(_MSC_VER) && defined(_M_X64)
+ x[0] = (uint_32t) __rdtsc();
+#else
+ _asm { push edx };
+ _asm { _emit 0fh }; _asm { _emit 031h }; /* opcode bytes 0F 31, as emitted in the Borland branch for RDTSC */
+ _asm { pop edx };
+ _asm { mov x[0],eax };
+#endif
+#elif defined(__MINGW_H) || defined(__GNUC__)
+#define COMPILER_ID "GCC"
+ /* EAX goes to x[0], EDX to x[1]; only x[0] is returned */
+ asm volatile("rdtsc" : "=a"(x[0]), "=d"(x[1]));
+#else
+#error "HI_RES_CLK_OK -- but no assembler code for this platform (?)"
+#endif
+ return x[0];
+#else
+ /* avoid annoying MSVC 9.0 compiler warning #4720 in ANSI mode! */
+#if (!defined(_MSC_VER)) || (!defined(__STDC__)) || (_MSC_VER < 1300)
+ FatalError("No support for RDTSC on this CPU platform\n");
+#endif
+ return 0;
+#endif /* defined(HI_RES_CLK_OK) */
+ }
+
+/******** OS-specific calls for setting priorities and sleeping ******/
+#if (defined(_MSC_VER) && (_MSC_VER >= 1300) && !defined(__STRICT_ANSI__) && !defined(__STDC__)) \
+ && defined(_M_X64)
+#include <Windows.h>
+#include <WinBase.h>
+
+#ifdef SKEIN_FORCE_LOCK_CPU /* NielsF says this is not a good way to do things */
+#define SKEIN_LOCK_CPU_OK (1)
+int Lock_CPU(void)
+    {   /* restrict this process to the processor it is running on (for perf timing) */
+        /* -- thanks to Brian Gladman for this code */
+        /* returns 0 on success, 1 if the mask could not be set, 2 if it could not be read */
+    HANDLE    proc = GetCurrentProcess();
+    DWORD_PTR procMask;
+    DWORD_PTR sysMask;
+
+    if (!GetProcessAffinityMask(proc, &procMask, &sysMask))
+        return 2;
+
+    /* keep only the bit of the current processor */
+    procMask &= (((size_t)1u) << GetCurrentProcessorNumber());
+    if (!SetProcessAffinityMask(proc, procMask))
+        return 1;
+
+    return 0; /* success */
+    }
+#endif
+
+#define _GOT_OS_SLEEP (1)
+void OS_Sleep(uint_t msec)
+ { /* Windows build: pass the msec value straight to the Win32 Sleep() call */
+ Sleep(msec);
+ }
+
+#define _GOT_OS_SET_PRIORITY (1)
+int OS_Set_High_Priority(void)
+    {   /* raise the current thread to THREAD_PRIORITY_HIGHEST (and lock the CPU if enabled) */
+        /* returns 0 on success, 1 if the priority change failed, 2 if Lock_CPU() failed */
+    const int raised = (SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST) != 0);
+
+    if (!raised)
+        return 1;
+#ifdef SKEIN_LOCK_CPU_OK
+    if (Lock_CPU() != 0)
+        return 2;
+#endif
+    return 0;
+    }
+
+int OS_Set_Normal_Priority(void)
+    {   /* put the current thread back to THREAD_PRIORITY_NORMAL: 0 = ok, 1 = failed */
+    return SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL) ? 0 : 1;
+    }
+#endif
+
+#if defined(__linux) || defined(__linux__) || defined(linux) || defined(__gnu_linux__)
+#include <unistd.h>
+#define _GOT_OS_SLEEP (1)
+void OS_Sleep(uint_t mSec)
+    {   /* Linux build: convert milliseconds to microseconds and hand them to usleep() */
+    const uint_t uSec = mSec*1000;
+
+    usleep(uSec);
+    }
+#endif
+
+#ifndef _GOT_OS_SET_PRIORITY
+/* dummy routines if nothing is available */
+/* (compiled only when no OS-specific section above defined _GOT_OS_SET_PRIORITY) */
+int OS_Set_High_Priority(void)
+ { /* nothing to do: always returns 0 */
+ return 0;
+ }
+int OS_Set_Normal_Priority(void)
+ { /* nothing to do: always returns 0 */
+ return 0;
+ }
+#endif
+
+#ifndef _GOT_OS_SLEEP
+/* fallback when no OS-specific OS_Sleep() was compiled above: does not sleep at all. */
+/* NOTE(review): this variant returns uint_32t while the OS-specific ones return void. */
+uint_32t OS_Sleep(uint_32t mSec)
+ {
+ return mSec; /* avoid compiler warnings */
+ }
+#endif
+
+#ifndef COMPILER_ID
+#define COMPILER_ID "(unknown)"
+#endif
+/********************** use RC4 to generate test data ******************/
+/* Note: this works identically on all platforms (big/little-endian) */
+static struct
+ {
+ uint_t I,J; /* RC4 vars */
+ u08b_t state[256];
+ } prng;
+
+void RandBytes(void *dst,uint_t byteCnt)
+    {   /* write byteCnt bytes of RC4 keystream (from the file-level prng state) to dst */
+    u08b_t *out = (u08b_t *) dst;
+    uint_t  idx;
+
+    for (idx=0;idx < byteCnt;idx++)
+        {
+        u08b_t si,sj;
+
+        /* advance the two RC4 indices (both kept in 0..255) */
+        prng.I = (prng.I+1) & 0xFF;
+        si     = prng.state[prng.I];
+        prng.J = (prng.J+si) & 0xFF;
+        sj     = prng.state[prng.J];
+
+        /* swap the two state entries */
+        prng.state[prng.I] = sj;
+        prng.state[prng.J] = si;
+
+        /* the output byte is selected by the sum of the swapped values */
+        out[idx] = prng.state[(si+sj) & 0xFF];
+        }
+    }
+
+/* Build a pseudo-random integer from four PRNG bytes, first byte most significant, */
+/* so the result does not depend on the host byte order.                            */
+uint_t Rand32(void)
+    {
+    u08b_t raw[4];
+    uint_t idx;
+    uint_t val = 0;
+
+    RandBytes(raw,sizeof(raw));
+
+    for (idx=0;idx < sizeof(raw);idx++)
+        val = (val << 8) | raw[idx];    /* shift in the next byte */
+
+    return val;
+    }
+
+/* (Re)initialize the RC4-based prng from a 64-bit seed */
+void Rand_Init(u64b_t seed)
+    {
+    u08b_t key[8];          /* seed bytes, least significant first */
+    u08b_t discard[512];    /* scratch for the keystream thrown away at the end */
+    u08b_t held;            /* temporary for the swap */
+    uint_t i,j;
+
+    /* spread the seed over the key bytes in an endian-independent fashion */
+    for (i=0;i < 8;i++)
+        key[i] = (u08b_t) (seed >> (8*i));
+
+    /* start from the identity permutation */
+    for (i=0;i < 256;i++)
+        prng.state[i] = (u08b_t) i;
+
+    /* RC4 key schedule, with the 8 key bytes used cyclically */
+    j = 0;
+    for (i=0;i < 256;i++)
+        {
+        j = (j + prng.state[i] + key[i%8]) & 0xFF;
+        held          = prng.state[i];
+        prng.state[i] = prng.state[j];
+        prng.state[j] = held;
+        }
+
+    /* reset the RC4 running indices */
+    prng.I = 0;
+    prng.J = 0;
+
+    /* discard initial keystream before returning */
+    RandBytes(discard,sizeof(discard));
+    }
+
+/***********************************************************************/
+/* An AHS-like API that allows explicit setting of block size */
+/* [i.e., the AHS API selects a block size based solely on the ] */
+/* [hash result length, while Skein allows independent hash ] */
+/* [result size and block size ] */
+/***********************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Pick the context matching blkSize (256, 512 or 1024 bits), record the state size */
+/* in state->statebits and initialize it for a hashbitlen-bit result.               */
+/* Any other blkSize returns SKEIN_FAIL and leaves *state untouched.                */
+int Skein_Init(int blkSize,hashState *state, int hashbitlen)
+    {
+    const size_t outBits = (size_t) hashbitlen;
+
+    if (blkSize == 256)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,outBits);
+        }
+    if (blkSize == 512)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,outBits);
+        }
+    if (blkSize == 1024)
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,outBits);
+        }
+    return SKEIN_FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Extended init: as Skein_Init(), but treeInfo, key and keyBytes are passed on to   */
+/* the block-size specific InitExt routine.                                          */
+/* Any blkSize other than 256/512/1024 returns SKEIN_FAIL and leaves *state alone.   */
+int Skein_InitExt(int blkSize,hashState *state, int hashbitlen,u64b_t treeInfo,const u08b_t *key,size_t keyBytes)
+    {
+    const size_t outBits = (size_t) hashbitlen;
+
+    if (blkSize == 256)
+        {
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_InitExt(&state->u.ctx_256,outBits,treeInfo,key,keyBytes);
+        }
+    if (blkSize == 512)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_InitExt(&state->u.ctx_512,outBits,treeInfo,key,keyBytes);
+        }
+    if (blkSize == 1024)
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_InitExt(&state->u.ctx1024,outBits,treeInfo,key,keyBytes);
+        }
+    return SKEIN_FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/*
+** Process data to be hashed.
+**   state      = context set up by Skein_Init()/Skein_InitExt(); statebits selects
+**                which of the three block-size contexts is used
+**   data       = input, databitlen bits long
+**   databitlen = bit count; when it is not a multiple of 8 the final partial byte
+**                is bit-padded in the context buffer and the BIT_PAD tweak flag is
+**                set, so only a zero-length Update() may follow
+** Returns the result of the underlying Skein_*_Update() for whole-byte input,
+** SUCCESS after partial-byte input, or a failure code for an unknown statebits.
+*/
+int Skein_Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    if ((databitlen & 7) == 0)
+        {
+        switch (state->statebits)
+            {
+            case 512: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+            case 256: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
+            case 1024: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
+            default: return SKEIN_FAIL;
+            }
+        }
+    else
+        {
+        size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle */
+        u08b_t mask,*p;
+
+        /* the version test must use _MSC_VER: the misspelled MSC_VER evaluated to 0, */
+        /* which compiled these checks out for every Microsoft compiler               */
+#if (!defined(_MSC_VER)) || (_MSC_VER >= 1200) /* MSC v4.2 gives (invalid) warning here!! */
+        Skein_assert(&state->u.h == &state->u.ctx_256.h); /* sanity checks: allow u.h --> all contexts */
+        Skein_assert(&state->u.h == &state->u.ctx_512.h);
+        Skein_assert(&state->u.h == &state->u.ctx1024.h);
+#endif
+        switch (state->statebits)
+            {
+            case 512: Skein_512_Update(&state->u.ctx_512,data,bCnt);
+                      p = state->u.ctx_512.b;
+                      break;
+            case 256: Skein_256_Update(&state->u.ctx_256,data,bCnt);
+                      p = state->u.ctx_256.b;
+                      break;
+            case 1024: Skein1024_Update(&state->u.ctx1024,data,bCnt);
+                      p = state->u.ctx1024.b;
+                      break;
+            default: return FAIL;
+            }
+        Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */
+        /* now "pad" the final partial byte the way NIST likes */
+        bCnt = state->u.h.bCnt; /* get the bCnt value (same location for all block sizes) */
+        Skein_assert(bCnt != 0); /* internal sanity check: there IS a partial byte in the buffer! */
+        mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */
+        /* (0-mask) keeps the bits above the pad position; OR-ing mask sets the pad bit itself */
+        p[bCnt-1] = (u08b_t)((p[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte (in the buffer) */
+
+        return SUCCESS;
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* Finish the hash and write the result to hashval, using the context selected by */
+/* state->statebits.  An unknown statebits value yields SKEIN_FAIL.                */
+int Skein_Final(hashState *state, BitSequence *hashval)
+    {
+    if (state->statebits == 512)
+        return Skein_512_Final(&state->u.ctx_512,hashval);
+    if (state->statebits == 256)
+        return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (state->statebits == 1024)
+        return Skein1024_Final(&state->u.ctx1024,hashval);
+    return SKEIN_FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* One-shot hash: Skein_Init() + Skein_Update() + Skein_Final() on a local context.   */
+/* Returns the Init result if Init fails, otherwise the Update result; the value      */
+/* returned by Skein_Final() is not examined.                                         */
+int Skein_Hash(int blkSize,int hashbitlen, const BitSequence *data, /* all-in-one call */
+               DataLength databitlen,BitSequence *hashval)
+    {
+    hashState state;
+    int r;
+
+    r = Skein_Init(blkSize,&state,hashbitlen);
+    if (r != SKEIN_SUCCESS)
+        return r;
+
+    /* these calls do not fail when called properly */
+    r = Skein_Update(&state,data,databitlen);
+    Skein_Final(&state,hashval);
+    return r;
+    }
+
+/***********************************************************************/
+/*
+** Various self-consistency checks.
+**   blkSize = Skein block size in bits
+**   maxLen  = largest message length tested, in bits (0 --> 2*blkSize)
+**   hashLen = hash result length in bits, a multiple of 8 (0 --> blkSize)
+**   nStep   = increment of the message byte count between tests
+**             (0 --> print a banner line and use 1)
+**   oneBlk  = if nonzero, only hash one message of oneBlk bits (bytes 0,1,2,...)
+**             and return 1
+** For every tested length the one-shot Skein_Hash() result is compared against
+** the AHS API Hash() (when Init() selects the same block size) and against four
+** incremental Update() sequences; bytes beyond the hash result must stay zero.
+** Any failure ends the program via FatalError().  Returns the number of message
+** lengths tested.
+*/
+uint_t Skein_Test(uint_t blkSize,uint_t maxLen,uint_t hashLen,uint_t nStep,uint_t oneBlk)
+    {
+    enum { MAX_BUF=1024 };
+    u08b_t b[MAX_BUF+4],hashVal[2][MAX_BUF+4];
+    uint_t i,j,k,n,bCnt,useAHS,step,bitLen,testCnt=0;
+    hashState s[2];
+
+    assert(blkSize > 0 && blkSize <= 1024 && (blkSize % 256) == 0);
+    assert((hashLen % 8) == 0);
+
+    if (maxLen > MAX_BUF*8) /* keep things reasonably small */
+        maxLen = MAX_BUF*8;
+    if (hashLen > MAX_BUF*8)
+        hashLen = MAX_BUF*8;
+    if (maxLen == 0) /* default sizes */
+        maxLen = blkSize*2;
+    if (hashLen == 0)
+        hashLen = blkSize;
+
+    if (oneBlk)
+        {
+        if (oneBlk > MAX_BUF*8)
+            oneBlk = MAX_BUF*8;
+        for (i=0;i<oneBlk/8;i++)
+            b[i] = (u08b_t) i;
+        if (Skein_Hash(blkSize,hashLen,b,oneBlk,hashVal[0]) != SKEIN_SUCCESS)
+            FatalError("Skein_Hash != SUCCESS");
+        return 1;
+        }
+
+    if (nStep == 0)
+        {
+        printf("Testing Skein: blkSize = %4d bits. hashLen=%4d bits. maxMsgLen = %4d bits.\n",
+               blkSize,hashLen,maxLen);
+        nStep = 1;
+        }
+
+    n = skein_DebugFlag;
+    skein_DebugFlag = 0; /* turn off debug display for this "fake" AHS call */
+    if (Init(&s[0],hashLen) != SUCCESS) /* just see if AHS API supports this <blkSize,hashLen> pair */
+        FatalError("AHS_API Init() error!");
+    skein_DebugFlag = n; /* restore debug display status */
+
+    useAHS = (s[0].statebits == blkSize); /* does this <blkSize,hashLen> pair work via AHS_API? */
+
+    bCnt = (maxLen + 7) / 8; /* convert maxLen to bytes */
+    for (n=0;n < bCnt;n+=nStep) /* process all the data lengths (# bytes = n+1)*/
+        {
+        /* get something to hash.  maxLen (a bit count) is used as the byte count   */
+        /* here, which can exceed sizeof(b): fill at most sizeof(b) bytes and throw  */
+        /* the remainder away, so the amount of PRNG output consumed is unchanged    */
+        if (maxLen <= sizeof(b))
+            RandBytes(b,maxLen);
+        else
+            {
+            u08b_t skip[64];
+            uint_t left,chunk;
+
+            RandBytes(b,(uint_t) sizeof(b));
+            for (left = maxLen - (uint_t) sizeof(b);left;left -= chunk)
+                {
+                chunk = (left < sizeof(skip)) ? left : (uint_t) sizeof(skip);
+                RandBytes(skip,chunk);
+                }
+            }
+        for (j=8;j>0;j--) /* j = # bits in final byte */
+            {
+            testCnt++;
+            memset(hashVal,0,sizeof(hashVal));
+            Show_Debug("\n*** Single Hash() call (%d bits)\n",8*n+j);
+            if (Skein_Hash(blkSize,hashLen,b,8*n+j,hashVal[0]) != SKEIN_SUCCESS)
+                FatalError("Skein_Hash != SUCCESS");
+            for (k=hashLen/8;k<=MAX_BUF;k++)
+                if (hashVal[0][k] != 0)
+                    FatalError("Skein hash output overrun!: hashLen = %d bits",hashLen);
+            if (useAHS) /* compare using AHS API, if supported */
+                {
+                Show_Debug("\n*** Single AHS API Hash() call\n");
+                if (Hash(hashLen,b,8*n+j,hashVal[1]) != SUCCESS)
+                    FatalError("Skein_Hash != SUCCESS");
+                for (k=hashLen/8;k<=MAX_BUF;k++)
+                    if (hashVal[1][k] != 0)
+                        FatalError("Skein AHS_API hash output overrun!: hashLen = %d bits",hashLen);
+                if (memcmp(hashVal[1],hashVal[0],hashLen/8))
+                    FatalError("Skein vs. AHS API miscompare");
+                }
+            /* now try (randomized) steps thru entire input block */
+            for (i=0;i<4;i++)
+                {
+                Show_Debug("\n*** Multiple Update() calls [%s]",(i)?"random steps":"step==1");
+                if (i >= 2)
+                    {
+                    Show_Debug(" [re-use precomputed state]");
+                    s[0] = s[1];
+                    }
+                else
+                    {
+                    k = (i) ? Skein_Init (blkSize,&s[0],hashLen) :
+                              Skein_InitExt(blkSize,&s[0],hashLen,SKEIN_CFG_TREE_INFO_SEQUENTIAL,NULL,0);
+                    if (k != SKEIN_SUCCESS)
+                        FatalError("Skein_Init != SUCCESS");
+                    s[1] = s[0]; /* make a copy for next time */
+                    }
+                Show_Debug("\n");
+                for (k=0;k<n+1;k+=step) /* step thru with variable sized steps */
+                    {/* for i == 0, step one byte at a time. for i>0, randomly */
+                    step = (i == 0) ? 1 : 1 + (Rand32() % (n+1-k)); /* # bytes to process */
+                    bitLen = (k+step >= n+1) ? 8*(step-1) + j: 8*step; /* partial final byte handling */
+                    if (Skein_Update(&s[0],&b[k],bitLen) != SKEIN_SUCCESS)
+                        FatalError("Skein_Update != SUCCESS");
+                    }
+                if (Skein_Final(&s[0],hashVal[1]) != SKEIN_SUCCESS)
+                    FatalError("Skein_Final != SUCCESS");
+                /* Skein_Final() just wrote hashVal[1], so that is the buffer to check */
+                for (k=hashLen/8;k<=MAX_BUF;k++)
+                    if (hashVal[1][k] != 0)
+                        FatalError("Skein hash output overrun!: hashLen = %d bits",hashLen);
+                if (memcmp(hashVal[1],hashVal[0],hashLen/8))
+                    FatalError("Skein Hash() vs. Update() miscompare!");
+                }
+            }
+        }
+    return testCnt;
+    }
+
+/* short-KAT filter: does this <blkSize,hashBits> pair belong to the reduced KAT set? */
+/* returns 1 to keep the pair, 0 to filter it out (unknown block sizes are rejected) */
+uint_t Short_KAT_OK(uint_t blkSize,uint_t hashBits)
+    {
+    if (blkSize == 256)
+        return (hashBits == 224 || hashBits == 256);
+    if (blkSize == 512)
+        return (hashBits == 256 || hashBits == 384 || hashBits == 512);
+    if (blkSize == 1024)
+        return (hashBits == 384 || hashBits == 512 || hashBits == 1024);
+    return 0;
+    }
+
+#if SKEIN_TREE_HASH
+#define MAX_TREE_MSG_LEN (1 << 12)
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* pad and process the final block, but skip the OUTPUT stage. */
+/* Dispatches on state->statebits; any size other than 256/512/1024 yields SKEIN_FAIL. */
+int Skein_Final_Pad(hashState *state, BitSequence *hashval)
+    {
+    int rc = SKEIN_FAIL;
+
+    if (state->statebits == 256)
+        rc = Skein_256_Final_Pad(&state->u.ctx_256,hashval);
+    else if (state->statebits == 512)
+        rc = Skein_512_Final_Pad(&state->u.ctx_512,hashval);
+    else if (state->statebits == 1024)
+        rc = Skein1024_Final_Pad(&state->u.ctx1024,hashval);
+    return rc;
+    }
+/* run only the OUTPUT stage for the context selected by state->statebits. */
+/* Any size other than 256/512/1024 yields SKEIN_FAIL. */
+int Skein_Output(hashState *state, BitSequence *hashval)
+    {
+    int rc = SKEIN_FAIL;
+
+    if (state->statebits == 256)
+        rc = Skein_256_Output(&state->u.ctx_256,hashval);
+    else if (state->statebits == 512)
+        rc = Skein_512_Output(&state->u.ctx_512,hashval);
+    else if (state->statebits == 1024)
+        rc = Skein1024_Output(&state->u.ctx1024,hashval);
+    return rc;
+    }
+
+/* generate a KAT test for the given data and tree parameters. */
+/* This is an "all-in-one" call. It is not intended to represent */
+/* how a real multi-processor version would be implemented, but */
+/* the results will be the same */
+/* */
+/* blkSize      = Skein block size in bits (256, 512 or 1024) */
+/* hashBits     = size of the final hash result, in bits */
+/* msg,msgBytes = message to hash; it is copied into the local work buffer M[], */
+/*                so msgBytes may not exceed sizeof(M) */
+/* leaf,node    = node size exponents: a level-1 node covers (blkBytes << leaf) bytes, */
+/*                nodes of higher levels cover (blkBytes << node) bytes */
+/* maxLevel     = level at which everything left is hashed as one big node */
+/* hashRes      = receives the OUTPUT stage result */
+/* */
+/* Any Skein call that does not return SKEIN_SUCCESS is reported via FatalError(). */
+void Skein_TreeHash
+    (uint_t blkSize,uint_t hashBits,const u08b_t *msg,size_t msgBytes,
+     uint_t leaf   ,uint_t node    ,uint_t maxLevel  ,u08b_t *hashRes)
+    {
+    uint_t  height;
+    uint_t  blkBytes  = blkSize/8;
+    uint_t  saveDebug = skein_DebugFlag;
+    size_t  n,nodeLen,srcOffs,dstOffs,bCnt;
+    u64b_t  treeInfo;
+    u08b_t  M[MAX_TREE_MSG_LEN+4];
+    hashState G,s;
+
+    assert(node < 256 && leaf < 256 && maxLevel < 256);
+    assert(node > 0 && leaf > 0 && maxLevel > 1 );
+    assert(blkSize == 256 || blkSize == 512 || blkSize == 1024);
+    assert(blkBytes <= sizeof(M));
+    assert(msgBytes <= sizeof(M));
+
+    /* precompute the config block result G for multiple uses below */
+#ifdef SKEIN_DEBUG
+    if (skein_DebugFlag)
+        skein_DebugFlag |= SKEIN_DEBUG_CONFIG;
+#endif
+    treeInfo = SKEIN_CFG_TREE_INFO(leaf,node,maxLevel);
+    if (Skein_InitExt(blkSize,&G,hashBits,treeInfo,NULL,0) != SKEIN_SUCCESS)
+        FatalError("Skein_InitExt() fails in tree");
+    skein_DebugFlag = saveDebug;
+
+    bCnt = msgBytes;
+    memcpy(M,msg,bCnt);
+    for (height=0;;height++) /* walk up the tree */
+        {
+        if (height && (bCnt==blkBytes)) /* are we done (with only one block left)? */
+            break;
+        if (height+1 == maxLevel) /* is this the final allowed level? */
+            { /* if so, do it as one big hash */
+            s = G;
+            Skein_Set_Tree_Level(s.u.h,height+1);
+            if (Skein_Update(&s,M,bCnt*8) != SKEIN_SUCCESS)
+                FatalError("Skein_Update() fails in tree");
+            if (Skein_Final_Pad(&s,M) != SKEIN_SUCCESS)
+                FatalError("Skein_Final_Pad() fails in tree");
+            break;
+            }
+        nodeLen = blkBytes << ((height) ? node : leaf);
+        /* hash each node of this level; the node results are packed back into the */
+        /* front of M[] and become the input data of the next level */
+        for (srcOffs=dstOffs=0;srcOffs <= bCnt;)
+            {
+            n = bCnt - srcOffs; /* number of bytes left at this level */
+            if (n > nodeLen) /* limit to node size */
+                n = nodeLen;
+            s = G;
+            s.u.h.T[0] = srcOffs; /* nonzero initial offset in tweak! */
+            Skein_Set_Tree_Level(s.u.h,height+1);
+            if (Skein_Update(&s,M+srcOffs,n*8) != SKEIN_SUCCESS)
+                FatalError("Skein_Update() fails in tree");
+            if (Skein_Final_Pad(&s,M+dstOffs) != SKEIN_SUCCESS) /* finish up this node, output intermediate result to M[]*/
+                FatalError("Skein_Final_Pad() fails in tree");
+            dstOffs+=blkBytes;
+            srcOffs+=n;
+            if (srcOffs >= bCnt) /* special logic to handle (msgBytes == 0) case */
+                break;
+            }
+        bCnt = dstOffs;
+        }
+
+    /* output the result */
+    if (Skein_Output(&s,hashRes) != SKEIN_SUCCESS)
+        FatalError("Skein_Output() fails in tree");
+    }
+
+/*
+** Generate tree-mode hash KAT vectors.
+** Note:
+** Tree vectors are different enough from non-tree vectors that it
+** makes sense to separate this out into a different function, rather
+** than shoehorn it into the same KAT logic as the other modes.
+**
+** blkSize is the Skein block size in bits (256, 512 or 1024). For each
+** selected <hashBits,tree parameter set,message length> combination the
+** message, the tree parameters and the Skein_TreeHash() result are
+** printed to stdout.
+**/
+void Skein_GenKAT_Tree(uint_t blkSize)
+ {
+ static const struct
+ {
+ /* leaf,node,maxLevel are handed to Skein_TreeHash(); levels is only used below to size the message */
+ uint_t leaf,node,maxLevel,levels;
+ }
+ TREE_PARMS[] = { {2,2,2,2}, {1,2,3,2}, {2,1,0xFF,3} };
+#define TREE_PARM_CNT (sizeof(TREE_PARMS)/sizeof(TREE_PARMS[0]))
+
+ u08b_t msg[MAX_TREE_MSG_LEN+4],hashVal[MAX_TREE_MSG_LEN+4];
+ uint_t i,j,k,n,p,q,hashBits,node,leaf,leafBytes,msgBytes,byteCnt,levels,maxLevel;
+
+ assert(blkSize == 256 || blkSize == 512 || blkSize == 1024);
+ for (i=0;i<MAX_TREE_MSG_LEN;i+=2)
+ { /* generate "incrementing" tree hash input msg data */
+ /* note: (i >> 16) is zero for every i below MAX_TREE_MSG_LEN (1 << 12) */
+ msg[i ] = (u08b_t) ((i ^ blkSize) ^ (i >> 16));
+ msg[i+1] = (u08b_t) ((i ^ blkSize) >> 8);
+ }
+ /* q carries over across all vectors: it is the number of bytes trimmed off each message */
+ for (k=q=n=0;k < HASH_BITS_CNT;k++)
+ {
+ hashBits = HASH_BITS[k];
+ if (!Short_KAT_OK(blkSize,hashBits))
+ continue;
+ if ((verbose & V_KAT_SHORT) && (hashBits != blkSize))
+ continue;
+ for (p=0;p <TREE_PARM_CNT;p++)
+ {
+ if (p && (verbose & V_KAT_SHORT))
+ continue; /* keep short KATs short */
+ if (p && hashBits != blkSize)
+ continue; /* we only need one "non-full" size */
+
+ leaf = TREE_PARMS[p].leaf;
+ node = TREE_PARMS[p].node;
+ maxLevel = TREE_PARMS[p].maxLevel;
+ levels = TREE_PARMS[p].levels;
+ leafBytes = (blkSize/8) << leaf; /* number of bytes in a "full" leaf */
+
+ for (j=0;j<4;j++) /* different numbers of leaf results */
+ {
+ if ((verbose & V_KAT_SHORT) && (j != 3) && (j != 0))
+ continue;
+ if (j && (hashBits != blkSize))
+ break;
+ /* n = number of full leaves that the message spans (before trimming by q) */
+ switch (j)
+ {
+ case 0: n = 1; break;
+ case 1: n = 2; break;
+ case 2: n = (1 << (node * (levels-2)))*3/2;
+ if (n <= 2) continue; break; /* would just repeat the j=0/j=1 sizes */
+ case 3: n = (1 << (node * (levels-1))); break;
+ }
+ byteCnt = n*leafBytes;
+ assert(byteCnt > 0);
+ if (byteCnt > MAX_TREE_MSG_LEN)
+ continue; /* message would not fit in msg[] */
+ q = (q+1) % leafBytes; /* q < leafBytes <= byteCnt, so the subtraction below cannot wrap */
+ msgBytes = byteCnt - q;
+ switch (blkSize)
+ {
+ case 256: printf("\n:Skein-256: "); break;
+ case 512: printf("\n:Skein-512: "); break;
+ case 1024: printf("\n:Skein-1024:"); break;
+ }
+ printf(" %4d-bit hash, msgLen =%6d bits",hashBits,msgBytes*8);
+ printf(". Tree: leaf=%02X, node=%02X, maxLevels=%02X\n",leaf,node,maxLevel);
+ printf("\nMessage data:\n");
+ if (msgBytes == 0)
+ printf(" (none)\n");
+ else
+ ShowBytes(msgBytes,msg);
+
+ Skein_TreeHash(blkSize,hashBits,msg,msgBytes,leaf,node,maxLevel,hashVal);
+
+ printf("Result:\n");
+ ShowBytes((hashBits+7)/8,hashVal);
+ printf("--------------------------------\n");
+ }
+ }
+ }
+ }
+#endif
+
+/*
+** Output some KAT values. This output is generally re-directed to a file and
+** can be compared across platforms to help validate an implementation on a
+** new platform (or compare reference vs. optimized code, for example). The
+** file will be provided as part of the Skein submission package to NIST.
+**
+** When used in conjunction with the debug flag, this will output a VERY long
+** result. The verbose flag is used to output even more combinations of
+** <blkSize,hashSize,msgLen>
+**
+** blkSizeMask selects the block sizes to show: a size is skipped when the
+** mask is nonzero and does not contain that size's bit value (256/512/1024).
+**
+** Note: this function does NOT output the NIST AHS KAT format.
+*/
+void Skein_ShowKAT(uint_t blkSizeMask)
+ {
+ enum
+ {
+ DATA_TYPE_ZERO = 0,
+ DATA_TYPE_INC,
+ DATA_TYPE_RAND,
+ DATA_TYPE_MAC,
+ DATA_TYPE_TREE,
+ DATA_TYPE_CNT,
+
+ MAX_BYTES = 3*1024/8
+ };
+ /* indexed by the DATA_TYPE_xxx values above */
+ static const char *TYPE_NAMES[] = { "zero","incrementing","random","random+MAC","tree",NULL };
+ /* message lengths (in bits) to generate vectors for */
+ static const uint_t MSG_BITS[] =
+ { 0,1,2,3,4,5,6,7,8,9,10,32,64,128,192,
+ 256-1, 256, 256+1, 384,
+ 512-1, 512, 512+1, 768,
+ 1024-1,1024,1024+1,
+ 2048-1,2048,2048+1
+ };
+#define MSG_BITS_CNT (sizeof(MSG_BITS)/sizeof(MSG_BITS[0]))
+
+ uint_t i,j,k,blkSize,dataType,hashBits,msgBits,keyBytes,blkBytes,keyType;
+ u08b_t data[MAX_BYTES+4],key[MAX_BYTES+4],hashVal[MAX_BYTES+4];
+ const char *msgType;
+ hashState s;
+
+ Rand_Init(SKEIN_MK_64(0xDEADBEEF,0)); /* init PRNG with repeatable value */
+ katHash = ~0u; /* reset here; presumably updated by the output helpers (not visible in this function) */
+ keyType = 0;
+
+ /* NOTE: both preprocessor branches below open one "if (...) {" block, closed after the blkSize loop */
+#ifdef SKEIN_DEBUG
+ /* first, show some "raw" Threefish + feedforward block calls, with round-by-round debug info if enabled */
+ if (skein_DebugFlag && !(verbose & V_KAT_NO_3FISH))
+ {
+ k = skein_DebugFlag; /* save debug flag value */
+ skein_DebugFlag = THREEFISH_DEBUG_ALL & ~ SKEIN_DEBUG_HDR; /* turn on full debug detail, use Threefish name */
+ skein_DebugFlag |= (k & SKEIN_DEBUG_PERMUTE);
+#else
+ if (verbose & V_KAT_DO_3FISH) /* allow non-SKEIN_DEBUG testing */
+ {
+#endif
+ for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+ {
+ if (blkSizeMask && (blkSize & blkSizeMask) == 0)
+ continue;
+ for (dataType=DATA_TYPE_ZERO; dataType <= DATA_TYPE_INC; dataType++)
+ {
+ switch (dataType)
+ {
+ case DATA_TYPE_ZERO:
+ memset(data,0,sizeof(data));
+ memset(key ,0,sizeof(key));
+ break;
+ case DATA_TYPE_INC:
+ for (i=0;i<MAX_BYTES;i++)
+ {
+ key [i] = (u08b_t) i ;
+ data[i] = (u08b_t) ~key[i];
+ }
+ break;
+ default:
+ continue;
+ }
+#ifdef SKEIN_DEBUG
+ switch (blkSize)
+ {
+ case 256: printf("\n:Threefish-256: "); break;
+ case 512: printf("\n:Threefish-512: "); break;
+ case 1024: printf("\n:Threefish-1024:"); break;
+ }
+ printf(" encryption + plaintext feedforward (round-by-round):\n");
+#endif
+ /* build a raw context by hand: the first 16 bytes of key[] become the tweak, the following bytes the X[] words */
+ memset(&s,0,sizeof(s));
+ s.u.h.hashBitLen = blkSize;
+ Skein_Get64_LSB_First(s.u.h.T ,key,2); /* init T[] */
+ Skein_Get64_LSB_First(s.u.ctx1024.X,key+2*8,blkSize/64); /* init X[] */
+ switch (blkSize)
+ {
+ case 256: Skein_256_Process_Block(&s.u.ctx_256,data,1,0); break;
+ case 512: Skein_512_Process_Block(&s.u.ctx_512,data,1,0); break;
+ case 1024: Skein1024_Process_Block(&s.u.ctx1024,data,1,0); break;
+ }
+#ifdef SKEIN_DEBUG
+ printf("++++++++++++++++++++++++++++++++++++++\n");
+#endif
+ }
+ }
+#ifdef SKEIN_DEBUG
+ skein_DebugFlag = k; /* restore the saved debug flag value */
+#endif
+ }
+
+ /* main KAT generation: one pass per data type */
+ for (dataType=DATA_TYPE_ZERO; dataType < DATA_TYPE_CNT; dataType++)
+ {
+ msgType = TYPE_NAMES[dataType];
+ switch (dataType)
+ {
+ case DATA_TYPE_ZERO:
+ memset(data,0,sizeof(data));
+ memset(key ,0,sizeof(key));
+ break;
+ case DATA_TYPE_INC:
+ for (i=0;i<MAX_BYTES;i++)
+ {
+ key [i] = (u08b_t) i ;
+ data[i] = (u08b_t) ~key[i];
+ }
+ break;
+ case DATA_TYPE_MAC:
+ RandBytes(key ,sizeof(key ));
+ /* no break: falls through so that the MAC vectors also get random message data */
+ case DATA_TYPE_RAND:
+ RandBytes(data,sizeof(data));
+ break;
+ case DATA_TYPE_TREE:
+ if (verbose & V_KAT_NO_TREE)
+ continue;
+ break;
+ default: /* should never get here */
+ FatalError("Invalid data type: %d --> '%s'",dataType,msgType);
+ break;
+ }
+
+ for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+ {
+ if (blkSizeMask && (blkSize & blkSizeMask) == 0)
+ continue;
+ if (dataType == DATA_TYPE_TREE)
+ { /* tree vectors come from their own generator (only when compiled in) */
+#if SKEIN_TREE_HASH
+ Skein_GenKAT_Tree(blkSize);
+#endif
+ continue;
+ }
+ if (verbose & V_KAT_NO_SEQ)
+ continue;
+ blkBytes = blkSize/8;
+ for (j=0;j < MSG_BITS_CNT;j++)
+ for (k=0;k < HASH_BITS_CNT;k++)
+ {
+ msgBits = MSG_BITS[j]; /* message length */
+ hashBits = HASH_BITS[k]; /* hash result size */
+ assert(MAX_BYTES*8 >= hashBits && MAX_BYTES*8 >= msgBits);
+ if (msgBits != 1024 && hashBits != blkSize && !(verbose & V_KAT_LONG))
+ continue; /* keep the output size reasonable, unless verbose */
+ if (verbose & V_KAT_SHORT)
+ { /* -v2 ==> generate "short" KAT set by filtering out most vectors */
+ if (dataType != DATA_TYPE_INC)
+ continue;
+ if (msgBits != 8 && msgBits != blkSize && msgBits != 2*blkSize)
+ continue;
+ if (!Short_KAT_OK(blkSize,hashBits))
+ continue;
+ }
+ switch (blkSize)
+ {
+ case 256: printf("\n:Skein-256: "); break;
+ case 512: printf("\n:Skein-512: "); break;
+ case 1024: printf("\n:Skein-1024:"); break;
+ }
+ printf(" %4d-bit hash, msgLen =%6d bits",hashBits,msgBits);
+ if (!(verbose & V_KAT_SHORT))
+ printf(", data = '%s'",msgType);
+ printf("\n\nMessage data:\n");
+ if (msgBits == 0)
+ printf(" (none)\n");
+ else
+ ShowBytes((msgBits+7)/8,data);
+ switch (dataType)
+ {
+ default: /* straight hash value */
+ if (Skein_Hash(blkSize,hashBits,data,msgBits,hashVal) != SKEIN_SUCCESS)
+ FatalError("Skein_Hash() error!");
+ break;
+ case DATA_TYPE_MAC: /* include some MAC computations in KAT file */
+ switch (keyType++) /* sequence thru different MAC key lengths */
+ {
+ case 0: keyBytes = blkBytes/2; break;
+ case 1: keyBytes = blkBytes; break;
+ case 2: keyBytes = blkBytes +1; break;
+ case 3: keyBytes = blkBytes*2+1; break;
+ default:keyBytes = 0; /* not actually a MAC this time, but use InitExt() */
+ keyType = 0; /* start the cycle again next time */
+ }
+ printf("MAC key = %4d bytes:\n",keyBytes);
+ if (keyBytes) /* show MAC key, if any */
+ ShowBytes(keyBytes,key);
+ else
+ printf(" (none) /* use InitExt() call */\n");
+
+ if (Skein_InitExt(blkSize,&s,hashBits,SKEIN_CFG_TREE_INFO_SEQUENTIAL,key,keyBytes) != SKEIN_SUCCESS)
+ FatalError("Skein_InitExt() error!");
+ if (Skein_Update(&s,data,msgBits) != SKEIN_SUCCESS)
+ FatalError("Skein_Update() error!");
+ if (Skein_Final(&s,hashVal) != SKEIN_SUCCESS)
+ FatalError("Skein_Final() error!");
+ break;
+ case DATA_TYPE_TREE: /* cannot happen: tree vectors were handled (and skipped) above */
+ assert(0);
+ break;
+ }
+ printf("Result:\n");
+ ShowBytes((hashBits+7)/8,hashVal);
+ printf("--------------------------------\n");
+ }
+ }
+ }
+ /* summary value goes to stderr so that it does not end up in a redirected KAT file */
+ if (!_quiet_)
+ fprintf(stderr,"katHash = %08X\n",katHash ^ 0x150183D2);
+ }
+
+/* write the text of a pre-computed IV header to stdout: for each listed */
+/* <blkSize,hashBits> pair, run Skein_Init() and print the resulting X[] */
+/* words of the context as a "const u64b_t SKEINxxxx_IV_nnn[]" table */
+void Skein_GenerateIV(void)
+    {
+    static const struct
+        { uint_t blkSize,hashBits; }
+    IV_TAB[] = /* which pairs to precompute */
+        { { 256, 128 }, { 256, 160 }, { 256, 224 }, { 256, 256 },
+          { 512, 128 }, { 512, 160 }, { 512, 224 }, { 512, 256 },
+          { 512, 384 }, { 512, 512 },
+          {1024, 384 }, {1024, 512 }, {1024,1024 }
+        };
+    const size_t  pairCnt = sizeof(IV_TAB)/sizeof(IV_TAB[0]);
+    uint_t        idx,wordIdx,wordCnt,blkSize,hashBits;
+    hashState     state;
+    const u64b_t *ivWords;
+    const char   *tag;
+
+    /* fixed file prologue */
+    printf("#ifndef _SKEIN_IV_H_\n"
+           "#define _SKEIN_IV_H_\n\n"
+           "#include \"skein.h\" /* get Skein macros and types */\n\n"
+           "/*\n"
+           "***************** Pre-computed Skein IVs *******************\n"
+           "**\n"
+           "** NOTE: these values are not \"magic\" constants, but\n"
+           "** are generated using the Threefish block function.\n"
+           "** They are pre-computed here only for speed; i.e., to\n"
+           "** avoid the need for a Threefish call during Init().\n"
+           "**\n"
+           "** The IV for any fixed hash length may be pre-computed.\n"
+           "** Only the most common values are included here.\n"
+           "**\n"
+           "************************************************************\n"
+           "**/\n\n"
+           "#define MK_64 SKEIN_MK_64\n\n"
+          );
+
+    for (idx=0; idx < pairCnt; idx++)
+        {
+        blkSize  = IV_TAB[idx].blkSize;
+        hashBits = IV_TAB[idx].hashBits;
+
+        /* point at the X[] array of the matching context and pick the name fragment */
+        if (blkSize == 256)
+            { ivWords = state.u.ctx_256.X; tag = "_256"; }
+        else if (blkSize == 512)
+            { ivWords = state.u.ctx_512.X; tag = "_512"; }
+        else if (blkSize == 1024)
+            { ivWords = state.u.ctx1024.X; tag = "1024"; }
+        else
+            {
+            FatalError("Invalid blkSize");
+            continue; /* should never happen, but avoids gcc warning */
+            }
+
+        if (Skein_Init(blkSize,&state,hashBits) != SKEIN_SUCCESS)
+            FatalError("Error generating IV: blkSize=%d, hashBits=%d",blkSize,hashBits);
+
+        wordCnt = blkSize/64;
+        printf("/* blkSize = %4d bits. hashSize = %4d bits */\n",blkSize,hashBits);
+        printf("const u64b_t SKEIN%s_IV_%d[] =\n {\n",tag,hashBits);
+        for (wordIdx=0; wordIdx < wordCnt; wordIdx++)
+            { /* each 64-bit word is printed as two 32-bit halves; no comma after the last one */
+            const char *sep = (wordIdx+1 == wordCnt) ? "" : ",";
+            printf(" MK_64(0x%08X,0x%08X)%s\n",
+                   (uint_32t)(ivWords[wordIdx] >> 32),(uint_32t)ivWords[wordIdx],sep);
+            }
+        printf(" };\n\n");
+        }
+    printf("#endif /* _SKEIN_IV_H_ */\n");
+    }
+
+/* qsort() comparison routine: orders uint_32t values ascending. */
+/* Returns 1, -1 or 0 when *aPtr is above, below or equal to *bPtr. */
+int compare_uint_32t(const void *aPtr,const void *bPtr)
+    {
+    /* read through const pointers: the elements are only inspected, never modified */
+    const uint_32t a = * ((const uint_32t *) aPtr);
+    const uint_32t b = * ((const uint_32t *) bPtr);
+
+    if (a > b) return 1;
+    if (a < b) return -1;
+    return 0;
+    }
+
+/* print a one-line build tag to stdout: " //:", the word size (or "32-XMM"), */
+/* COMPILER_ID followed by the CVER version suffix and, for asm or looped */
+/* builds, the three per-block-size unroll counts in square brackets */
+void ShowCompiler(const char *CVER)
+ {
+ printf(" //:");
+#if defined(SKEIN_XMM)
+ printf(" 32-XMM, ");
+#else
+ printf(" %2u-bit, ",(uint_t)(8*sizeof(size_t)));
+#endif
+ printf("%s%s",COMPILER_ID,CVER);
+
+ /* do we need to show unroll amount? */
+ /* note: _SC_DO_LOOP_ is a plain #define, so it stays defined for the rest of the file */
+#if defined(SKEIN_USE_ASM) && SKEIN_USE_ASM
+ printf(" [asm=");
+#define _SC_DO_LOOP_ (1)
+#elif defined(SKEIN_LOOP)
+ printf(" [ C =");
+#define _SC_DO_LOOP_ (1)
+#endif
+
+#ifdef _SC_DO_LOOP_
+ /* one character per block size: the unroll count as a digit, or '.' when the count is zero */
+ printf("%c",(Skein_256_Unroll_Cnt())?'0'+Skein_256_Unroll_Cnt():'.');
+ printf("%c",(Skein_512_Unroll_Cnt())?'0'+Skein_512_Unroll_Cnt():'.');
+ printf("%c",(Skein1024_Unroll_Cnt())?'0'+Skein1024_Unroll_Cnt():'.');
+ printf("]");
+#endif
+ }
+
+/* measure the speed (in CPU clks/byte) for a Skein implementation */
+/* */
+/* target is the text that follows "-p" on the command line (may be empty): */
+/* - its leading number (atoi) selects a single message size in bytes; */
+/* 0 or empty means "all sizes" and also enables the table header */
+/* - an optional ".NN" suffix gives the number of repetitions (default 1) */
+void Skein_MeasurePerformance(const char *target)
+ {
+ /* message sizes to measure, in bytes; the 0 entry terminates the list */
+ const uint_t MSG_BYTES[] = {1,2,4,8,10,16,32,64,100,128,256,512,1000,1024,2048,4096,8192,10000,16384,32768,100000,0};
+ enum { TIMER_SAMPLE_CNT = 13, MAX_BUFFER=1024*100, PERF_TIMEOUT_CLKS = 500000 };
+ enum { _256 = 256, _512 = 512 }; /* lets OneTest(_256) pass "_256" as the hashBitLen argument */
+ uint_32t dt[24][3][TIMER_SAMPLE_CNT],t0,t1; /* dt[message size][block size][sample] */
+ uint_32t dtMin = ~0u;
+ uint_t targetSize = 0;
+ uint_t repCnt = 1;
+ uint_t i,k,n,r,blkSize,msgBytes;
+ u08b_t b[MAX_BUFFER],hashVal[SKEIN1024_BLOCK_BYTES*4];
+ hashState s;
+#ifdef CompilerVersion
+ char CVER[20]; /* avoid ANSI compiler warnings for sprintf()! :-(( */
+ /* build "_vD.DD" from the three low decimal digits of CompilerVersion */
+ n = CompilerVersion;
+ CVER[0] = '_';
+ CVER[1] = 'v';
+ CVER[2] = (char)('0'+((n /100)%10));
+ CVER[3] = '.';
+ CVER[4] = (char)('0'+((n / 10)%10));
+ CVER[5] = (char)('0'+((n / 1)%10));
+ CVER[6] = 0;
+#else
+#define CVER ""
+#endif
+ if (target && target[0])
+ {
+ targetSize = atoi(target);
+ for (i=0;target[i];i++)
+ if (target[i] == '.')
+ {
+ repCnt = atoi(target+i+1);
+ break;
+ }
+ if (repCnt == 0)
+ repCnt = 1;
+ }
+
+ /* make sure dt[] has a row for every MSG_BYTES[] entry */
+ assert(sizeof(dt)/(3*TIMER_SAMPLE_CNT*sizeof(dt[0][0][0])) >=
+ sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]));
+ if (OS_Set_High_Priority())
+ printf("Unable to set thread to high priority\n");
+ fflush(stdout); /* let things calm down */
+ OS_Sleep(200); /* let things settle down for a bit */
+ memset(dt,0,sizeof(dt));
+ RandBytes(b,sizeof(b)); /* use random data for testing */
+ for (i=0;i<4*TIMER_SAMPLE_CNT;i++) /* calibrate the overhead for measuring time */
+ {
+ t0 = HiResTime();
+ t1 = HiResTime();
+ if (dtMin > t1-t0) /* keep only the minimum time */
+ dtMin = t1-t0;
+ }
+ for (r=0;r<repCnt;r++)
+ {
+ /* first take all the data and store it in dt, with no printf() activity */
+ for (n=0;n < sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]);n++)
+ {
+ msgBytes = MSG_BYTES[n]; /* pick the message size (in bytes) */
+ if (msgBytes > MAX_BUFFER || msgBytes == 0)
+ break;
+ if (targetSize && targetSize != msgBytes)
+ continue;
+ for (k=0;k<3;k++)
+ { /* cycle thru the different block sizes */
+ blkSize=256 << k;
+ t0=HiResTime();
+ t1=HiResTime();
+#define OneTest(BITS) \
+ Skein##BITS##_Init (&s.u.ctx##BITS,BITS); \
+ Skein##BITS##_Update(&s.u.ctx##BITS,b,msgBytes);\
+ Skein##BITS##_Final (&s.u.ctx##BITS,hashVal);
+
+ OS_Sleep(0); /* yield the time slice to OS */
+ for (i=0;i<TIMER_SAMPLE_CNT;i++)
+ {
+ HiResTime(); /* prime the pump */
+ switch (blkSize)
+ {
+ case 256:
+ OneTest(_256); /* prime the pump */
+ t0 = HiResTime();
+ OneTest(_256); /* do it twice for some averaging */
+ OneTest(_256);
+ t1 = HiResTime();
+ break;
+ case 512:
+ OneTest(_512);
+ t0 = HiResTime();
+ OneTest(_512);
+ OneTest(_512);
+ t1 = HiResTime();
+ break;
+ case 1024:
+ OneTest(1024);
+ t0 = HiResTime();
+ OneTest(1024);
+ OneTest(1024);
+ t1 = HiResTime();
+ break;
+ }
+ /* two hashes were timed, hence the division by 2 */
+ dt[n][k][i] = ((t1 - t0) - dtMin)/2; /* adjust for HiResTime() overhead */
+ }
+ }
+ }
+ /* NOTE(review): this call sits inside the repetition loop, so with repCnt > 1 the later */
+ /* repetitions appear to run after the priority was already restored -- confirm intended */
+ OS_Set_Normal_Priority();
+
+ if (targetSize == 0)
+ {
+ printf("\nSkein performance, in clks per byte, dtMin = %4d clks.\n",dtMin);
+ printf(" [compiled %s,%s by '%s%s', %u-bit]\n",__TIME__,__DATE__,COMPILER_ID,CVER,(uint_t)(8*sizeof(size_t)));
+ printf(" =================================================================\n");
+ printf(" || Skein block size |\n");
+ printf(" ||--------------------------------------------------------------|\n");
+ printf(" Message || 256 bits | 512 bits | 1024 bits |\n");
+ printf(" Length ||====================|====================|====================|\n");
+ printf(" (bytes) || min median | min median | min median |\n");
+ printf("=========||====================|====================|====================|\n");
+ }
+
+ /* now display the results */
+ for (n=0;n < sizeof(MSG_BYTES)/sizeof(MSG_BYTES[0]);n++)
+ {
+ msgBytes = MSG_BYTES[n]; /* pick the message size (in bytes) */
+ if (msgBytes > MAX_BUFFER || msgBytes == 0)
+ break;
+ if (targetSize && targetSize != msgBytes)
+ continue;
+ printf("%7d_ ||",msgBytes);
+ for (k=0;k<3;k++) /* cycle thru the different Skein block sizes */
+ { /* here with dt[n][k][] full of time differences */
+ /* discard high/low, then show min/median of the rest, in clks/byte */
+ /* (after sorting, [1] is the smallest sample once the lowest is dropped) */
+ qsort(dt[n][k],TIMER_SAMPLE_CNT,sizeof(dt[0][0][0]),compare_uint_32t);
+ printf(" %8.2f %8.2f |",dt[n][k][1]/(double)msgBytes,dt[n][k][TIMER_SAMPLE_CNT/2]/(double)msgBytes);
+ }
+ ShowCompiler(CVER);
+ printf("\n");
+ if (targetSize == 0 && target && target[0] && repCnt == 1)
+ { /* show the details */
+ for (k=0;k<3;k++)
+ {
+ printf("%4d: ",256 << k);
+ for (i=0;i<TIMER_SAMPLE_CNT;i++)
+ printf("%8d",dt[n][k][i]);
+ printf("\n");
+ }
+ }
+ }
+ }
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+ /* optional footer: code size of the API and block functions, per block size */
+ if (targetSize == 0)
+ {
+ printf("=========||====================|====================|====================|\n");
+ printf("Code Size|| | | |\n");
+ printf("=========||====================|====================|====================|\n");
+ printf(" API || %12d bytes | %12d bytes | %12d bytes |",
+ (int) Skein_256_API_CodeSize(),
+ (int) Skein_512_API_CodeSize(),
+ (int) Skein1024_API_CodeSize());
+ ShowCompiler(CVER);
+ printf("\n");
+ printf(" Block || %12d bytes | %12d bytes | %12d bytes |",
+ (int) Skein_256_Process_Block_CodeSize(),
+ (int) Skein_512_Process_Block_CodeSize(),
+ (int) Skein1024_Process_Block_CodeSize());
+ ShowCompiler(CVER);
+ printf("\n");
+ }
+#endif
+ }
+
+/* print the command-line syntax summary to stdout, then terminate with exit code 2. */
+/* The list covers every switch that main() accepts. */
+void GiveHelp(void)
+    {
+    printf("Syntax: skein_test [options]\n"
+           "Options: -bNN = set Skein block size to NN bits\n"
+           " -lNN = set max test length to NN bits\n"
+           " -tNN = set Skein hash length to NN bits\n"
+           " -sNN = set initial random seed\n"
+           " -g = generate precomputed IV values to stdout\n"
+           " -k = output KAT results to stdout\n"
+           " -p = output performance (clks/byte)\n"
+           " -vNN = set verbose flag bits to NN (no NN = long KAT set)\n"
+           " -dNN = set Skein debug flag bits to NN\n"
+           " -q = quiet mode\n"
+           " -? = show this help text\n"
+          );
+    exit(2);
+    }
+
+/* test program entry point. */
+/* Parses the command-line switches (see GiveHelp()), then either outputs the */
+/* KAT vectors (-k) or runs the randomized Skein_Test() consistency checks for */
+/* the selected block/hash sizes. -g and -p perform their action and return */
+/* immediately. A bare numeric argument sets oneBlk, which is passed on to */
+/* Skein_Test(). Returns 0; errors are reported through FatalError(). */
+int main(int argc,char *argv[])
+    {
+    int    i,n;
+    uint_t testCnt;
+    uint_t doKAT   = 0; /* generate KAT vectors? */
+    uint_t blkSize = 0; /* Skein state size in bits */
+    uint_t maxLen  = 1024; /* max block size in bits */
+    uint_t hashLen = 0; /* hash length in bits (0 --> all) */
+    uint_t seed0   = (uint_t) time(NULL); /* randomize based on time */
+    uint_t oneBlk  = 0; /* test block size */
+
+    for (i=1;i<argc;i++)
+        { /* process command-line switches */
+        if (argv[i][0] == '-')
+            {
+            /* <ctype.h> functions need an unsigned char value: a plain (possibly negative) char is undefined behavior */
+            switch(toupper((unsigned char) argv[i][1]))
+                {
+                case '?': GiveHelp(); break;
+                case 'B': blkSize |= atoi(argv[i]+2); break;
+                case 'L': maxLen = atoi(argv[i]+2); break;
+                case 'S': seed0 = atoi(argv[i]+2); break;
+                case 'T': hashLen = atoi(argv[i]+2); break;
+                case 'K': doKAT = 1; break;
+                case 'V': verbose |= (argv[i][2]) ? atoi(argv[i]+2) : V_KAT_LONG; break;
+                case 'G': Skein_GenerateIV(); return 0;
+                case 'P': Skein_MeasurePerformance(argv[i]+2);return 0;
+                case 'Q': _quiet_ = 1; break;
+                case 'D': switch (toupper((unsigned char) argv[i][2]))
+                    {
+#ifdef SKEIN_DEBUG
+                    case 0 : skein_DebugFlag |= SKEIN_DEBUG_DEFAULT; break;
+                    case '-': skein_DebugFlag |= SKEIN_DEBUG_SHORT; break;
+                    case '+': skein_DebugFlag |= SKEIN_DEBUG_ALL; break;
+                    case 'P': skein_DebugFlag |= SKEIN_DEBUG_PERMUTE; break;
+                    case 'I': skein_DebugFlag |= SKEIN_DEBUG_SHORT | SKEIN_DEBUG_INJECT; break;
+                    case 'C': skein_DebugFlag |= SKEIN_DEBUG_SHORT & ~SKEIN_DEBUG_CONFIG; break;
+#endif
+                    default : skein_DebugFlag |= atoi(argv[i]+2); break;
+                    }
+                    break;
+                default: FatalError("Unsupported command-line option: %s",argv[i]);
+                    break;
+                }
+            }
+        else if (argv[i][0] == '?')
+            GiveHelp();
+        else if (isdigit((unsigned char) argv[i][0]))
+            oneBlk = atoi(argv[i]);
+        }
+
+    if (blkSize == 0) /* default is all block sizes */
+        blkSize = 256 | 512 | 1024;
+    if (doKAT)
+        {
+        Skein_ShowKAT(blkSize);
+        }
+    else
+        {
+        if (oneBlk == 0)
+            printf("Seed0 = %d. Compiler = %s\n",seed0,COMPILER_ID);
+        Rand_Init(SKEIN_MK_64(0xDEADBEEF,seed0)); /* init PRNG for test data */
+
+        testCnt=0;
+        for (i=256;i<=1024;i*=2)
+            {
+            if (blkSize & i)
+                {
+                if (hashLen == 0) /* use all hash sizes? */
+                    {
+                    for (n=0;n < HASH_BITS_CNT;n++)
+                        testCnt += Skein_Test(i,maxLen,HASH_BITS[n],0,oneBlk);
+                    }
+                else
+                    testCnt += Skein_Test(i,maxLen,hashLen,0,oneBlk);
+                }
+            }
+        if (oneBlk)
+            return 0;
+        if (testCnt)
+            printf("Success: %4d tests\n",testCnt);
+        }
+    /* do a quick final self-consistency check test to make sure nothing is broken */
+    skein_DebugFlag = 0; /* no debug output here */
+    for (blkSize = 256;blkSize <= 1024; blkSize*=2)
+        {
+        Skein_Test(blkSize,16,0,1,0);
+        }
+
+    return 0;
+    }
diff --git a/Optimized_32bit/SHA3api_ref.c b/Optimized_32bit/SHA3api_ref.c
new file mode 100644
index 000000000000..6861a3e4bffb
--- /dev/null
+++ b/Optimized_32bit/SHA3api_ref.c
@@ -0,0 +1,115 @@
+/***********************************************************************
+**
+** Implementation of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <string.h> /* get the memcpy/memset functions */
+#include "skein.h" /* get the Skein API definitions */
+#include "SHA3api_ref.h"/* get the AHS API definitions */
+
+/******************************************************************/
+/* AHS API code */
+/******************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* select the context size and init the context */
+/* */
+/* The Skein state size is chosen from hashbitlen alone: */
+/*   hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS --> Skein-256 (only when that limit is nonzero) */
+/*   hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS --> Skein-512 */
+/*   anything larger                           --> Skein-1024 */
+/* state->statebits records the choice (in bits); the return value is the */
+/* result of the selected Skein_xxx_Init() call. */
+HashReturn Init(hashState *state, int hashbitlen)
+    {
+    /* the limit macro is spelled SKEIN_256_NIST_MAX_HASHBITS (as in SHA3api_ref.h); */
+    /* testing any other spelling silently compiles this branch out for good */
+#if SKEIN_256_NIST_MAX_HASHBITS
+    if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+        {
+        Skein_Assert(hashbitlen > 0,BAD_HASHLEN);
+        state->statebits = 64*SKEIN_256_STATE_WORDS;
+        return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
+        }
+#endif
+    if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+        {
+        state->statebits = 64*SKEIN_512_STATE_WORDS;
+        return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
+        }
+    else
+        {
+        state->statebits = 64*SKEIN1024_STATE_WORDS;
+        return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process data to be hashed */
+/* */
+/* databitlen is a bit count. Whole-byte lengths are passed straight to the */
+/* Skein_xxx_Update() of the selected state size. A length with a partial */
+/* final byte gets Skein bit padding applied to that byte and sets the */
+/* bit-pad tweak flag, after which only zero-length calls are accepted. */
+/* Returns the status of the underlying Skein_xxx_Update() call(s), or FAIL */
+/* for an invalid state size. */
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+    {
+    /* only the final Update() call is allowed do partial bytes, else assert an error */
+    Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL);
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    if ((databitlen & 7) == 0) /* whole bytes only (no partial final byte)? */
+        {
+        /* (statebits >> 8) & 3 maps 256 --> 1, 512 --> 2, 1024 --> 0 */
+        switch ((state->statebits >> 8) & 3)
+            {
+            case 2: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
+            case 1: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
+            case 0: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
+            default: return FAIL;
+            }
+        }
+    else
+        { /* handle partial final byte */
+        size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */
+        u08b_t b,mask;
+        int    r; /* status of the Skein_xxx_Update() calls */
+
+        mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */
+        b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */
+
+        switch ((state->statebits >> 8) & 3)
+            {
+            case 2: r = Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */
+                    if (r == SKEIN_SUCCESS)
+                        r = Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */
+                    break;
+            case 1: r = Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte */
+                    if (r == SKEIN_SUCCESS)
+                        r = Skein_256_Update(&state->u.ctx_256,&b , 1 ); /* process the (masked) partial byte */
+                    break;
+            case 0: r = Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte */
+                    if (r == SKEIN_SUCCESS)
+                        r = Skein1024_Update(&state->u.ctx1024,&b , 1 ); /* process the (masked) partial byte */
+                    break;
+            default: return FAIL;
+            }
+        if (r != SKEIN_SUCCESS) /* do not flag bit padding for data that was not absorbed */
+            return (HashReturn) r;
+        Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */
+
+        return SUCCESS;
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finish the hash computation: hand off to the Skein_xxx_Final() that */
+/* matches the state size and let it write the result to hashval */
+HashReturn Final(hashState *state, BitSequence *hashval)
+    {
+    uint_t sizeSel; /* (statebits >> 8) & 3: 1 for 256, 2 for 512, 0 for 1024 */
+
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+
+    sizeSel = (state->statebits >> 8) & 3;
+    if (sizeSel == 0)
+        return Skein1024_Final(&state->u.ctx1024,hashval);
+    if (sizeSel == 1)
+        return Skein_256_Final(&state->u.ctx_256,hashval);
+    if (sizeSel == 2)
+        return Skein_512_Final(&state->u.ctx_512,hashval);
+    return FAIL;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* all-in-one hash function */
+/* */
+/* Runs Init(), Update() and Final() on a local state. The first call that */
+/* does not return SUCCESS ends the sequence and its status is returned; */
+/* hashval is only written when Init() and Update() both succeeded. */
+HashReturn Hash(int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState state;
+    HashReturn r = Init(&state,hashbitlen);
+    if (r == SUCCESS)
+        r = Update(&state,data,databitlen);
+    if (r == SUCCESS) /* do not finalize (or hide the error of) a failed Update() */
+        r = Final(&state,hashval);
+    return r;
+    }
diff --git a/Optimized_32bit/SHA3api_ref.h b/Optimized_32bit/SHA3api_ref.h
new file mode 100644
index 000000000000..6d62304e59b7
--- /dev/null
+++ b/Optimized_32bit/SHA3api_ref.h
@@ -0,0 +1,66 @@
+#ifndef _AHS_API_H_
+#define _AHS_API_H_
+
+/***********************************************************************
+**
+** Interface declarations of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include "skein.h"
+
+/* return codes of the AHS API calls; each value is the Skein return code of the same name */
+typedef enum
+ {
+ SUCCESS = SKEIN_SUCCESS,
+ FAIL = SKEIN_FAIL,
+ BAD_HASHLEN = SKEIN_BAD_HASHLEN
+ }
+ HashReturn;
+
+typedef size_t DataLength; /* bit count type */
+typedef u08b_t BitSequence; /* bit stream type */
+
+/* AHS API hash state: statebits says which member of the union is in use */
+typedef struct
+ {
+ uint_t statebits; /* 256, 512, or 1024 */
+ union
+ {
+ Skein_Ctxt_Hdr_t h; /* common header "overlay" */
+ Skein_256_Ctxt_t ctx_256; /* context used when statebits == 256 */
+ Skein_512_Ctxt_t ctx_512; /* context used when statebits == 512 */
+ Skein1024_Ctxt_t ctx1024; /* context used when statebits == 1024 */
+ } u;
+ }
+ hashState;
+
+/* "incremental" hashing API */
+HashReturn Init (hashState *state, int hashbitlen);
+HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
+HashReturn Final (hashState *state, BitSequence *hashval);
+
+/* "all-in-one" call */
+HashReturn Hash (int hashbitlen, const BitSequence *data,
+ DataLength databitlen, BitSequence *hashval);
+
+
+/*
+** Re-define the compile-time constants below to change the selection
+** of the Skein state size in the Init() function in SHA3api_ref.c.
+**
+** That is, the NIST API does not allow for explicit selection of the
+** Skein block size, so it must be done implicitly in the Init() function.
+** The selection is controlled by these constants.
+*/
+#ifndef SKEIN_256_NIST_MAX_HASHBITS
+#define SKEIN_256_NIST_MAX_HASHBITS (0)
+#endif
+
+#ifndef SKEIN_512_NIST_MAX_HASHBITS
+#define SKEIN_512_NIST_MAX_HASHBITS (512)
+#endif
+
+#endif /* ifndef _AHS_API_H_ */
diff --git a/Optimized_32bit/brg_endian.h b/Optimized_32bit/brg_endian.h
new file mode 100644
index 000000000000..978eb33f08cf
--- /dev/null
+++ b/Optimized_32bit/brg_endian.h
@@ -0,0 +1,148 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+ 1. distributions of this source code include the above copyright
+ notice, this list of conditions and the following disclaimer;
+
+ 2. distributions in binary form include the above copyright
+ notice, this list of conditions and the following disclaimer
+ in the documentation and/or other associated materials;
+
+ 3. the copyright holder's name is not used to endorse products
+ built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+# include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+ defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+# include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+# if !defined( __MINGW32__ ) && !defined(AVR)
+# include <endian.h>
+# if !defined( __BEOS__ )
+# include <byteswap.h>
+# endif
+# endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
+/* seem to encompass most endian symbol definitions */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) /* form 1 (SYMBOL): both constants exist, so BYTE_ORDER must pick one */
+# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif /* no match: nothing is defined by this form */
+#elif defined( BIG_ENDIAN ) /* only one constant is defined: its presence alone selects the order */
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) /* form 2 (_SYMBOL); not guarded by !defined(PLATFORM_BYTE_ORDER), so it may define the macro again */
+# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( _BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) /* form 3 (__SYMBOL); same structure as form 1 */
+# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) /* form 4 (__SYMBOL__); same structure as form 1 */
+# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* if the platform byte order could not be determined, then try to */
+/* set this define using common machine defines */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
+ defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
+ defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
+ defined( vax ) || defined( vms ) || defined( VMS ) || \
+ defined( __VMS ) || defined( _M_X64 ) || defined( AVR )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
+ defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
+ defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
+ defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
+ defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
+ defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
+ defined( THINK_C ) || defined( __VMCMS__ )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0 /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0 /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+#endif
+
+/* special handler for IA64, which may be either endianness (?) */
+/* here we assume little-endian, but this may need to be changed */
+#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+# define PLATFORM_MUST_ALIGN (1)
+#ifndef PLATFORM_BYTE_ORDER
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#endif
+
+#ifndef PLATFORM_MUST_ALIGN
+# define PLATFORM_MUST_ALIGN (0)
+#endif
+
+#endif /* ifndef BRG_ENDIAN_H */
diff --git a/Optimized_32bit/brg_types.h b/Optimized_32bit/brg_types.h
new file mode 100644
index 000000000000..d6d6cdab9fbf
--- /dev/null
+++ b/Optimized_32bit/brg_types.h
@@ -0,0 +1,188 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+ 1. distributions of this source code include the above copyright
+ notice, this list of conditions and the following disclaimer;
+
+ 2. distributions in binary form include the above copyright
+ notice, this list of conditions and the following disclaimer
+ in the documentation and/or other associated materials;
+
+ 3. the copyright holder's name is not used to endorse products
+ built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'. These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable. However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* uint_8t: unsigned char, accepted only when UCHAR_MAX shows it is exactly */
+/* 8 bits wide. Defining BRG_UI8 beforehand suppresses this whole block. */
+#ifndef BRG_UI8
+# define BRG_UI8
+# if UCHAR_MAX == 255u
+ typedef unsigned char uint_8t;
+# else
+# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+# endif
+#endif
+
+/* uint_16t: unsigned short, accepted only when USHRT_MAX shows it is exactly */
+/* 16 bits wide. Defining BRG_UI16 beforehand suppresses this whole block. */
+#ifndef BRG_UI16
+# define BRG_UI16
+# if USHRT_MAX == 65535u
+ typedef unsigned short uint_16t;
+# else
+# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+# endif
+#endif
+
+/* uint_32t: the first of unsigned int / unsigned long whose maximum is */
+/* 2^32-1. li_32(h) pastes hex digits h into a literal with the suffix */
+/* matching the chosen type. Defining BRG_UI32 beforehand suppresses this block. */
+#ifndef BRG_UI32
+# define BRG_UI32
+# if UINT_MAX == 4294967295u
+# define li_32(h) 0x##h##u
+ typedef unsigned int uint_32t;
+# elif ULONG_MAX == 4294967295u
+# define li_32(h) 0x##h##ul
+ typedef unsigned long uint_32t;
+# elif defined( _CRAY )
+# error This code needs 32-bit data types, which Cray machines do not provide
+# else
+# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+# endif
+#endif
+
+/* uint_64t: optional 64-bit unsigned type. Unlike the narrower types, */
+/* BRG_UI64 is defined only when a suitable type is actually found, so */
+/* code can test BRG_UI64 to see whether uint_64t and li_64() exist. */
+/* Compiler-specific cases are tried first, then the <limits.h> maxima. */
+/* Note: once a "*_MAX > 4294967295u" branch is entered, no later branch */
+/* is tried even if the nested exact-64-bit test fails. */
+#ifndef BRG_UI64
+# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+# define BRG_UI64
+# define li_64(h) 0x##h##ui64
+ typedef unsigned __int64 uint_64t;
+# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
+# define BRG_UI64
+# define li_64(h) 0x##h##ui64
+ typedef unsigned __int64 uint_64t;
+# elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+# define BRG_UI64
+# define li_64(h) 0x##h##ull
+ typedef unsigned long long uint_64t;
+# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+# if UINT_MAX == 18446744073709551615u
+# define BRG_UI64
+# define li_64(h) 0x##h##u
+ typedef unsigned int uint_64t;
+# endif
+# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+# if ULONG_MAX == 18446744073709551615ul
+# define BRG_UI64
+# define li_64(h) 0x##h##ul
+ typedef unsigned long uint_64t;
+# endif
+# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+# if ULLONG_MAX == 18446744073709551615ull
+# define BRG_UI64
+# define li_64(h) 0x##h##ull
+ typedef unsigned long long uint_64t;
+# endif
+# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+# if ULONG_LONG_MAX == 18446744073709551615ull
+# define BRG_UI64
+# define li_64(h) 0x##h##ull
+ typedef unsigned long long uint_64t;
+# endif
+# elif defined(__GNUC__) /* DLW: avoid mingw problem with -ansi */
+# define BRG_UI64
+# define li_64(h) 0x##h##ull
+ typedef unsigned long long uint_64t;
+# endif
+#endif
+
+/* a missing 64-bit type is an error only for code that asked for one */
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )
+# error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+/* VOID_RETURN / INT_RETURN: return-type prefixes for function declarations. */
+/* They expand to plain void/int by default and gain export/import or */
+/* calling-convention decoration when DLL_EXPORT or DLL_IMPORT is defined, */
+/* or when building with Watcom. Defining RETURN_VALUES beforehand */
+/* suppresses this block. */
+#ifndef RETURN_VALUES
+# define RETURN_VALUES
+# if defined( DLL_EXPORT )
+# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+# define VOID_RETURN __declspec( dllexport ) void __stdcall
+# define INT_RETURN __declspec( dllexport ) int __stdcall
+# elif defined( __GNUC__ )
+# define VOID_RETURN __declspec( __dllexport__ ) void
+# define INT_RETURN __declspec( __dllexport__ ) int
+# else
+# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+# endif
+# elif defined( DLL_IMPORT )
+# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+# define VOID_RETURN __declspec( dllimport ) void __stdcall
+# define INT_RETURN __declspec( dllimport ) int __stdcall
+# elif defined( __GNUC__ )
+# define VOID_RETURN __declspec( __dllimport__ ) void
+# define INT_RETURN __declspec( __dllimport__ ) int
+# else
+# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+# endif
+# elif defined( __WATCOMC__ )
+# define VOID_RETURN void __cdecl
+# define INT_RETURN int __cdecl
+# else
+# define VOID_RETURN void
+# define INT_RETURN int
+# endif
+#endif
+
+/* These defines are used to declare buffers in a way that allows
+ faster operations on longer variables to be used. In all these
+ defines 'size' must be a power of 2 and >= 8
+
+ dec_unit_type(size,x) declares a variable 'x' of length
+ 'size' bits
+
+ dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
+ bytes defined as an array of variables
+ each of 'size' bits (bsize must be a
+ multiple of size / 8)
+
+ ptr_cast(x,size) casts a pointer to a pointer to a
+ variable of length 'size' bits
+*/
+
+/* ui_type(size): names the unsigned type of 'size' bits (uint_<size>t). */
+/* 'size' is token-pasted, so it must be a plain number such as 8 or 32. */
+#define ui_type(size) uint_##size##t
+/* dec_unit_type(size,x): typedef 'x' as the 'size'-bit unsigned type. */
+#define dec_unit_type(size,x) typedef ui_type(size) x
+/* dec_bufr_type(size,bsize,x): typedef 'x' as an array of 'size'-bit */
+/* units spanning 'bsize' bytes. 'bsize' is parenthesized so that an */
+/* expression argument (e.g. N+4) is divided as a whole. */
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
+/* ptr_cast(x,size): cast pointer 'x' to a pointer to 'size'-bit units. */
+#define ptr_cast(x,size) ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/Optimized_32bit/skein.c b/Optimized_32bit/skein.c
new file mode 100644
index 000000000000..c9289cd49e8e
--- /dev/null
+++ b/Optimized_32bit/skein.c
@@ -0,0 +1,753 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+
+#include <string.h> /* get the memcpy/memset functions */
+#include "skein.h" /* get the Skein API definitions */
+#include "skein_iv.h" /* get precomputed IVs */
+
+/*****************************************************************/
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+/* ctx: hash context whose chaining state is updated */
+/* blkPtr: first byte of the block data */
+/* blkCnt: number of consecutive blocks to process */
+/* byteCntAdd: callers in this file pass the block size for full blocks, */
+/* the buffered byte count for the final block, SKEIN_CFG_STR_LEN */
+/* for the config block and sizeof(u64b_t) for output counters. */
+/* NOTE(review): presumably the per-block increment of the tweak */
+/* byte counter -- confirm in skein_block.c. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd);
+
+/*****************************************************************/
+/* 256-bit Skein */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 by */
+/* Skein_Assert (the macro's exact failure behavior is */
+/* defined outside this file) */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+ {
+ union
+ {
+ u08b_t b[SKEIN_256_STATE_BYTES];
+ u64b_t w[SKEIN_256_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen)
+ { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ /* IV tables come from skein_iv.h; with SKEIN_NO_PRECOMP every length takes the default path */
+ case 256: memcpy(ctx->X,SKEIN_256_IV_256,sizeof(ctx->X)); break;
+ case 224: memcpy(ctx->X,SKEIN_256_IV_224,sizeof(ctx->X)); break;
+ case 160: memcpy(ctx->X,SKEIN_256_IV_160,sizeof(ctx->X)); break;
+ case 128: memcpy(ctx->X,SKEIN_256_IV_128,sizeof(ctx->X)); break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /* build/process the config block, type == CONFIG (could be precomputed) */
+ Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+ /* compute the initial chaining values from config block */
+ memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */
+ Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+ break;
+ }
+ /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+ /* Set up to process the data message portion of the hash (default) */
+ Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 */
+/* treeInfo: value stored in config word 2 */
+/* key, keyBytes: optional key; key must be non-NULL when keyBytes != 0 */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_256_STATE_BYTES];
+        u64b_t  w[SKEIN_256_STATE_WORDS];
+        } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_256_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_256_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* Copy only as much as ctx->X[] can hold: the assert above allows */
+        /* cfg.b[] to be larger than ctx->X[], so sizing the copy by the   */
+        /* source could write past the end of the chaining variables.      */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_256_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(256,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* absorb msgByteCnt bytes from msg[] into the hash context */
+/* Data is hashed only once more than one block's worth is on */
+/* hand; up to one full block always stays buffered in ctx->b[]. */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t n;                                   /* scratch: byte count, then block count */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* does buffered + new data exceed a single block? */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+        {
+        /* first, top up and flush whatever is already buffered */
+        if (ctx->h.bCnt != 0)
+            {
+            n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;    /* room left in b[] */
+            if (n != 0)
+                {
+                Skein_assert(n < msgByteCnt);           /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,n);
+                ctx->h.bCnt += n;
+                msg         += n;
+                msgByteCnt  -= n;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+            Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        /* then hash whole blocks straight out of the caller's buffer, */
+        /* always holding back at least one byte for the final block   */
+        if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
+            {
+            n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */
+            Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES);
+            msg        += n * SKEIN_256_BLOCK_BYTES;
+            msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* stash the tail of the message in b[] for later */
+    if (msgByteCnt != 0)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+/* ctx: context holding the buffered tail of the message */
+/* hashVal: receives (hashBitLen + 7) / 8 bytes of output */
+/* returns: SKEIN_SUCCESS on the normal path */
+/* On return ctx->X holds the chaining value from before the output */
+/* stage (it is restored after every output block). */
+int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN_256_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+ Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ /* one iteration per output block; the last one may be partial */
+ for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)
+ {
+ /* NOTE(review): the cast assumes ctx->b is suitably aligned for u64b_t -- confirm against the context layout in skein.h */
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL);
+ Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* Byte distance from the address of Skein_256_Init to the address of this */
+/* function. NOTE(review): presumably used as a code-size estimate, which */
+/* relies on the toolchain laying the functions out in source order. */
+size_t Skein_256_API_CodeSize(void)
+    {
+    u08b_t *apiEnd   = (u08b_t *) Skein_256_API_CodeSize;
+    u08b_t *apiStart = (u08b_t *) Skein_256_Init;
+
+    return apiEnd - apiStart;
+    }
+#endif
+
+/*****************************************************************/
+/* 512-bit Skein */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 by */
+/* Skein_Assert (the macro's exact failure behavior is */
+/* defined outside this file) */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+ {
+ union
+ {
+ u08b_t b[SKEIN_512_STATE_BYTES];
+ u64b_t w[SKEIN_512_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen)
+ { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ /* IV tables come from skein_iv.h; with SKEIN_NO_PRECOMP every length takes the default path */
+ case 512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); break;
+ case 384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X)); break;
+ case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break;
+ case 224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X)); break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /* build/process the config block, type == CONFIG (could be precomputed) */
+ Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+ /* compute the initial chaining values from config block */
+ memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */
+ Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+ /* Set up to process the data message portion of the hash (default) */
+ Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 */
+/* treeInfo: value stored in config word 2 */
+/* key, keyBytes: optional key; key must be non-NULL when keyBytes != 0 */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+        } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_512_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_512_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* Copy only as much as ctx->X[] can hold: the assert above allows */
+        /* cfg.b[] to be larger than ctx->X[], so sizing the copy by the   */
+        /* source could write past the end of the chaining variables.      */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN_512_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(512,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* absorb msgByteCnt bytes from msg[] into the hash context */
+/* Data is hashed only once more than one block's worth is on */
+/* hand; up to one full block always stays buffered in ctx->b[]. */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+    {
+    size_t n;                                   /* scratch: byte count, then block count */
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* does buffered + new data exceed a single block? */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+        {
+        /* first, top up and flush whatever is already buffered */
+        if (ctx->h.bCnt != 0)
+            {
+            n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;    /* room left in b[] */
+            if (n != 0)
+                {
+                Skein_assert(n < msgByteCnt);           /* check on our logic here */
+                memcpy(ctx->b + ctx->h.bCnt,msg,n);
+                ctx->h.bCnt += n;
+                msg         += n;
+                msgByteCnt  -= n;
+                }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+            }
+        /* then hash whole blocks straight out of the caller's buffer, */
+        /* always holding back at least one byte for the final block   */
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+            {
+            n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */
+            Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+            msg        += n * SKEIN_512_BLOCK_BYTES;
+            msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+            }
+        Skein_assert(ctx->h.bCnt == 0);
+        }
+
+    /* stash the tail of the message in b[] for later */
+    if (msgByteCnt != 0)
+        {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->b + ctx->h.bCnt,msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+        }
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+/* ctx: context holding the buffered tail of the message */
+/* hashVal: receives (hashBitLen + 7) / 8 bytes of output */
+/* returns: SKEIN_SUCCESS on the normal path */
+/* On return ctx->X holds the chaining value from before the output */
+/* stage (it is restored after every output block). */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN_512_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+
+ Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ /* one iteration per output block; the last one may be partial */
+ for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+ {
+ /* NOTE(review): the cast assumes ctx->b is suitably aligned for u64b_t -- confirm against the context layout in skein.h */
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL);
+ Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+/* Byte distance from the address of Skein_512_Init to the address of this */
+/* function. NOTE(review): presumably used as a code-size estimate, which */
+/* relies on the toolchain laying the functions out in source order. */
+size_t Skein_512_API_CodeSize(void)
+    {
+    u08b_t *apiEnd   = (u08b_t *) Skein_512_API_CodeSize;
+    u08b_t *apiStart = (u08b_t *) Skein_512_Init;
+
+    return apiEnd - apiStart;
+    }
+#endif
+
+/*****************************************************************/
+/* 1024-bit Skein */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 by */
+/* Skein_Assert (the macro's exact failure behavior is */
+/* defined outside this file) */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+ {
+ union
+ {
+ u08b_t b[SKEIN1024_STATE_BYTES];
+ u64b_t w[SKEIN1024_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen)
+ { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ /* IV tables come from skein_iv.h; with SKEIN_NO_PRECOMP every length takes the default path */
+ case 512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break;
+ case 384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break;
+ case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /* build/process the config block, type == CONFIG (could be precomputed) */
+ Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+ /* compute the initial chaining values from config block */
+ memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */
+ Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+ /* Set up to process the data message portion of the hash (default) */
+ Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation */
+/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+/* ctx: context to set up */
+/* hashBitLen: requested output length in bits; checked to be > 0 */
+/* treeInfo: value stored in config word 2 */
+/* key, keyBytes: optional key; key must be non-NULL when keyBytes != 0 */
+/* returns: SKEIN_SUCCESS on the normal path */
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+    {
+    union
+        {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+        } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+        {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+        }
+    else                                        /* here to pre-process a key */
+        {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein1024_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein1024_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        /* Copy only as much as ctx->X[] can hold: the assert above allows */
+        /* cfg.b[] to be larger than ctx->X[], so sizing the copy by the   */
+        /* source could write past the end of the chaining variables.      */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[] */
+#if SKEIN_NEED_SWAP
+        {
+        uint_t i;
+        for (i=0;i<SKEIN1024_STATE_WORDS;i++)   /* convert key bytes to context words */
+            ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+        }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(1024,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: full blocks are hashed right away, while the trailing 1..SKEIN1024_BLOCK_BYTES bytes stay buffered in ctx->b[] for a later Update/Final call */
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+ {
+ size_t n;
+
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) /* strictly greater: an exactly-full buffer is left unprocessed */
+ {
+ if (ctx->h.bCnt) /* finish up any buffered message data */
+ {
+ n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+ if (n)
+ {
+ Skein_assert(n < msgByteCnt); /* check on our logic here */
+ memcpy(&ctx->b[ctx->h.bCnt],msg,n); /* top up b[] to a full block */
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+ Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /* now process any remaining full blocks, directly from input message data */
+ if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
+ {
+ n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process; the -1 keeps at least one byte back */
+ Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
+ msg += n * SKEIN1024_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt)
+ {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+ memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: (hashBitLen+7)/8 bytes are written to hashVal */
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN1024_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+
+ Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block; only bCnt bytes are added to the length tweak */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) /* one pass per output block */
+ {
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL); /* T0 = 0; T1 = FIRST | OUT | FINAL; bCnt = 0 */
+ Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES; /* at most one block per pass */
+ Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_API_CodeSize(void) /* compiled only when SKEIN_CODE_SIZE or SKEIN_PERF is defined */
+ {
+ return ((u08b_t *) Skein1024_API_CodeSize) - /* byte distance between the two function addresses */
+ ((u08b_t *) Skein1024_Init); /* NOTE(review): presumably relies on the toolchain emitting functions contiguously in source order -- confirm */
+ }
+#endif
+
+/**************** Functions to support MAC/tree hashing ***************/
+/* (this code is identical for Optimized and Reference versions) */
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: pads and processes the buffered data, then emits SKEIN_256_BLOCK_BYTES bytes of chaining state to hashVal */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+ Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block; only bCnt bytes are added to the length tweak */
+
+ Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES); /* "output" the state bytes */
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: pads and processes the buffered data, then emits SKEIN_512_BLOCK_BYTES bytes of chaining state to hashVal */
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+ Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block; only bCnt bytes are added to the length tweak */
+
+ Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES); /* "output" the state bytes */
+
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: pads and processes the buffered data, then emits SKEIN1024_BLOCK_BYTES bytes of chaining state to hashVal */
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */
+ memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+ Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block; only bCnt bytes are added to the length tweak */
+
+ Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES); /* "output" the state bytes */
+
+ return SKEIN_SUCCESS;
+ }
+
+#if SKEIN_TREE_HASH
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal from the current chaining vars; ctx->X is restored afterwards, ctx->b[] and the tweak words are overwritten */
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN_256_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++) /* one pass per output block */
+ {
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL); /* T0 = 0; T1 = FIRST | OUT | FINAL; bCnt = 0 */
+ Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES; /* at most one block per pass */
+ Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal from the current chaining vars; ctx->X is restored afterwards, ctx->b[] and the tweak words are overwritten */
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN_512_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) /* one pass per output block */
+ {
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL); /* T0 = 0; T1 = FIRST | OUT | FINAL; bCnt = 0 */
+ Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES; /* at most one block per pass */
+ Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); /* debug callout: report the 512-bit variant (was 256, a copy-paste slip) */
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: write (hashBitLen+7)/8 bytes to hashVal from the current chaining vars; ctx->X is restored afterwards, ctx->b[] and the tweak words are overwritten */
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+ {
+ size_t i,n,byteCnt;
+ u64b_t X[SKEIN1024_STATE_WORDS];
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */
+
+ /* now output the result */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */
+
+ /* run Threefish in "counter mode" to generate output */
+ memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
+ for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) /* one pass per output block */
+ {
+ ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+ Skein_Start_New_Type(ctx,OUT_FINAL); /* T0 = 0; T1 = FIRST | OUT | FINAL; bCnt = 0 */
+ Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+ n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES; /* at most one block per pass */
+ Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); /* debug callout: report the 1024-bit variant (was 256, a copy-paste slip) */
+ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
+ }
+ return SKEIN_SUCCESS;
+ }
+#endif
diff --git a/Optimized_32bit/skein.h b/Optimized_32bit/skein.h
new file mode 100644
index 000000000000..721c9bc9ce0d
--- /dev/null
+++ b/Optimized_32bit/skein.h
@@ -0,0 +1,327 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_ 1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+**
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+** SKEIN_DEBUG -- make callouts from inside Skein code
+** to examine/display intermediate values.
+** [default: no callouts (no overhead)]
+**
+** SKEIN_ERR_CHECK -- how error checking is handled inside Skein
+** code. If not defined, most error checking
+** is disabled (for performance). Otherwise,
+** caller errors are flagged as follows:
+** SKEIN_ASSERT defined: use assert() to flag errors
+** SKEIN_ASSERT not defined: return an error code (e.g., SKEIN_FAIL)
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h> /* get size_t definition */
+#include "skein_port.h" /* get platform-specific definitions */
+
+enum
+ {
+ SKEIN_SUCCESS = 0, /* return codes from Skein calls */
+ SKEIN_FAIL = 1,
+ SKEIN_BAD_HASHLEN = 2
+ };
+
+#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */
+
+#define SKEIN_256_STATE_WORDS ( 4)
+#define SKEIN_512_STATE_WORDS ( 8)
+#define SKEIN1024_STATE_WORDS (16)
+#define SKEIN_MAX_STATE_WORDS (16)
+
+#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
+
+#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+
+typedef struct
+ {
+ size_t hashBitLen; /* size of hash result, in bits */
+ size_t bCnt; /* current byte count in buffer b[] */
+ u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
+ } Skein_Ctxt_Hdr_t;
+
+typedef struct /* 256-bit Skein hash context structure */
+ {
+ Skein_Ctxt_Hdr_t h; /* common header context variables */
+ u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */
+ u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+ } Skein_256_Ctxt_t;
+
+typedef struct /* 512-bit Skein hash context structure */
+ {
+ Skein_Ctxt_Hdr_t h; /* common header context variables */
+ u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */
+ u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+ } Skein_512_Ctxt_t;
+
+typedef struct /* 1024-bit Skein hash context structure */
+ {
+ Skein_Ctxt_Hdr_t h; /* common header context variables */
+ u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */
+ u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+ } Skein1024_Ctxt_t;
+
+/* Skein APIs for (incremental) "straight hashing" */
+int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+
+int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+/*
+** Skein APIs for "extended" initialization: MAC keys, tree hashing.
+** After an InitExt() call, just use Update/Final calls as with Init().
+**
+** Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
+** When keyBytes == 0 and treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL,
+** the results of InitExt() are identical to calling Init().
+** The function Init() may be called once to "precompute" the IV for
+** a given hashBitLen value, then by saving a copy of the context
+** the IV computation may be avoided in later calls.
+** Similarly, the function InitExt() may be called once per MAC key
+** to precompute the MAC IV, then a copy of the context saved and
+** reused for each new MAC computation.
+**/
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes);
+
+/*
+** Skein APIs for MAC and tree hash:
+** Final_Pad: pad, do final block, but no OUTPUT type
+** Output: do just the output stage
+*/
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+
+#ifndef SKEIN_TREE_HASH
+#define SKEIN_TREE_HASH (1)
+#endif
+#if SKEIN_TREE_HASH
+int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+#endif
+
+/*****************************************************************
+** "Internal" Skein definitions
+** -- not needed for sequential hashing API, but will be
+** helpful for other uses of Skein (e.g., tree hash mode).
+** -- included here so that they can be shared between
+** reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */
+
+#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bit 126 : first block flag */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */
+#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */
+#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION (1)
+
+#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+#define SKEIN_CFG_STR_LEN (4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \
+ ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \
+ (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \
+ (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+
+/*
+** Skein macros for getting/setting tweak words, etc.
+** These are useful for partial input bytes, hash tree init/update, etc.
+**/
+#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0)
+#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1)
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1) \
+ { \
+ Skein_Set_T0(ctxPtr,(T0)); \
+ Skein_Set_T1(ctxPtr,(T1)); \
+ }
+
+#define Skein_Set_Type(ctxPtr,BLK_TYPE) \
+ Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = SKEIN_T1_FLAG_FIRST | NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \
+ { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; }
+#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; }
+
+#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+
+/*****************************************************************
+** "Internal" Skein definitions for debugging and error checking
+******************************************************************/
+#ifdef SKEIN_DEBUG /* examine/display intermediate values? */
+#include "skein_debug.h"
+#else /* default is no callouts */
+#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
+#define Skein_Show_Round(bits,ctx,r,X)
+#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
+#define Skein_Show_Final(bits,ctx,cnt,outPtr)
+#define Skein_Show_Key(bits,ctx,key,keyBytes)
+#endif
+
+#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_assert(x) /* internal checks are compiled out as well */
+#elif defined(SKEIN_ASSERT) /* checks enabled and SKEIN_ASSERT defined: both macros use assert() */
+#include <assert.h>
+#define Skein_Assert(x,retCode) assert(x) /* retCode is unused in this configuration */
+#define Skein_assert(x) assert(x)
+#else /* checks enabled, no SKEIN_ASSERT: caller errors return a code, internal errors assert() */
+#include <assert.h>
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */
+#define Skein_assert(x) assert(x) /* internal error */
+#endif
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum
+ {
+ /* Skein_256 round rotation constants */
+ R_256_0_0=14, R_256_0_1=16,
+ R_256_1_0=52, R_256_1_1=57,
+ R_256_2_0=23, R_256_2_1=40,
+ R_256_3_0= 5, R_256_3_1=37,
+ R_256_4_0=25, R_256_4_1=33,
+ R_256_5_0=46, R_256_5_1=12,
+ R_256_6_0=58, R_256_6_1=22,
+ R_256_7_0=32, R_256_7_1=32,
+
+ /* Skein_512 round rotation constants */
+ R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+ R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+ R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+ R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+ R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+ R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+ R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+ R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+
+ /* Skein1024 round rotation constants */
+ R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
+ R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
+ R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
+ R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
+ R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
+ R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
+ R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
+ R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+ };
+
+#ifndef SKEIN_ROUNDS
+#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+#else /* allow command-line define in range 8*(5..14) */
+#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5))
+#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))
+#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ifndef _SKEIN_H_ */
diff --git a/Optimized_32bit/skein_block.c b/Optimized_32bit/skein_block.c
new file mode 100644
index 000000000000..bfd29d1eee2d
--- /dev/null
+++ b/Optimized_32bit/skein_block.c
@@ -0,0 +1,689 @@
+/***********************************************************************
+**
+** Implementation of the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Compile-time switches:
+**
+** SKEIN_USE_ASM -- set bits (256/512/1024) to select which
+** versions use ASM code for block processing
+** [default: use C for all block sizes]
+**
+************************************************************************/
+
+#include <string.h>
+#include "skein.h"
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 (hundreds digit sets SKEIN_UNROLL_256, tens digit sets SKEIN_UNROLL_512; a digit of 0 means fully unrolled) */
+#endif
+
+#define BLK_BITS (WCNT*64) /* some useful definitions for code here */
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
+
+#ifdef SKEIN_DEBUG
+#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#else
+#define DebugSaveTweak(ctx)
+#endif
+
+/***************************** Skein_256 ******************************/
+#if !(SKEIN_USE_ASM & 256)
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+ { /* do it in C: process blkCnt consecutive blocks at blkPtr, adding byteCntAdd to the length tweak T[0] for each block and updating ctx->X[] */
+ enum
+ {
+ WCNT = SKEIN_256_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
+#else
+#define SKEIN_UNROLL_256 (0)
+#endif
+
+#if SKEIN_UNROLL_256
+#if (RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
+#endif
+ size_t r;
+ u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+ u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
+#endif
+ u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */
+ u64b_t w [WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */
+ Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3;
+#endif
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0]; /* ts/ks are views into kw[] (see the KW_TWK_BASE / KW_KEY_BASE macros) */
+ ts[1] = ctx->h.T[1];
+ do {
+ /* this implementation only supports 2**64 input bytes (no carry out here) */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; /* extra key word: XOR of the chaining words and the parity constant */
+
+ ts[2] = ts[0] ^ ts[1]; /* third tweak word: XOR of the first two */
+
+ Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1] + ts[0];
+ X2 = w[2] + ks[2] + ts[1];
+ X3 = w[3] + ks[3];
+
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */
+
+ blkPtr += SKEIN_256_BLOCK_BYTES; /* advance to the next input block */
+
+ /* run the rounds */
+
+#define Round256(p0,p1,p2,p3,ROT,rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+
+#if SKEIN_UNROLL_256 == 0
+#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \
+ Round256(p0,p1,p2,p3,ROT,rNum) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+
+#define I256(R) \
+ X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \
+ X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \
+ X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \
+ X3 += ks[((R)+4) % 5] + (R)+1; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else /* looping version */
+#define R256(p0,p1,p2,p3,ROT,rNum) \
+ Round256(p0,p1,p2,p3,ROT,rNum) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+
+#define I256(R) \
+ X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
+ X1 += ks[r+(R)+1] + ts[r+(R)+0]; \
+ X2 += ks[r+(R)+2] + ts[r+(R)+1]; \
+ X3 += ks[r+(R)+3] + r+(R) ; \
+ ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\
+ ts[r + (R)+2 ] = ts[r+(R)-1]; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+ for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */
+#endif
+ {
+#define R256_8_rounds(R) \
+ R256(0,1,2,3,R_256_0,8*(R) + 1); \
+ R256(0,3,2,1,R_256_1,8*(R) + 2); \
+ R256(0,1,2,3,R_256_2,8*(R) + 3); \
+ R256(0,3,2,1,R_256_3,8*(R) + 4); \
+ I256(2*(R)); \
+ R256(0,1,2,3,R_256_4,8*(R) + 5); \
+ R256(0,3,2,1,R_256_5,8*(R) + 6); \
+ R256(0,1,2,3,R_256_6,8*(R) + 7); \
+ R256(0,3,2,1,R_256_7,8*(R) + 8); \
+ I256(2*(R)+1);
+
+ R256_8_rounds( 0);
+
+#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
+
+ #if R256_Unroll_R( 1)
+ R256_8_rounds( 1);
+ #endif
+ #if R256_Unroll_R( 2)
+ R256_8_rounds( 2);
+ #endif
+ #if R256_Unroll_R( 3)
+ R256_8_rounds( 3);
+ #endif
+ #if R256_Unroll_R( 4)
+ R256_8_rounds( 4);
+ #endif
+ #if R256_Unroll_R( 5)
+ R256_8_rounds( 5);
+ #endif
+ #if R256_Unroll_R( 6)
+ R256_8_rounds( 6);
+ #endif
+ #if R256_Unroll_R( 7)
+ R256_8_rounds( 7);
+ #endif
+ #if R256_Unroll_R( 8)
+ R256_8_rounds( 8);
+ #endif
+ #if R256_Unroll_R( 9)
+ R256_8_rounds( 9);
+ #endif
+ #if R256_Unroll_R(10)
+ R256_8_rounds(10);
+ #endif
+ #if R256_Unroll_R(11)
+ R256_8_rounds(11);
+ #endif
+ #if R256_Unroll_R(12)
+ R256_8_rounds(12);
+ #endif
+ #if R256_Unroll_R(13)
+ R256_8_rounds(13);
+ #endif
+ #if R256_Unroll_R(14)
+ R256_8_rounds(14);
+ #endif
+ #if (SKEIN_UNROLL_256 > 14)
+#error "need more unrolling in Skein_256_Process_Block"
+ #endif
+ }
+ /* do the final "feedforward" xor, update context chaining vars */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+
+ Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST; /* FIRST flag applies to the first block only */
+ }
+ while (--blkCnt);
+ ctx->h.T[0] = ts[0]; /* write the updated tweak words back to the context */
+ ctx->h.T[1] = ts[1];
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_256_Process_Block_CodeSize(void)
+ { /* returns the byte distance from Skein_256_Process_Block to this function */
+ return ((u08b_t *) Skein_256_Process_Block_CodeSize) -
+ ((u08b_t *) Skein_256_Process_Block); /* NOTE(review): a code-size estimate only if the two functions are emitted adjacently, in source order -- confirm for your toolchain */
+ }
+uint_t Skein_256_Unroll_Cnt(void)
+ { /* reports the compile-time SKEIN_UNROLL_256 setting */
+ return SKEIN_UNROLL_256;
+ }
+#endif
+#endif
+
+/***************************** Skein_512 ******************************/
+#if !(SKEIN_USE_ASM & 512)
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+ { /* do it in C: process blkCnt consecutive blocks from blkPtr, adding byteCntAdd to T[0] for each */
+ enum
+ {
+ WCNT = SKEIN_512_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_512_ROUNDS_TOTAL/8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop: tens digit of SKEIN_LOOP */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
+#else /* no SKEIN_LOOP: 0 selects the fully unrolled (loop-free) code below */
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+#if SKEIN_UNROLL_512
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512" /* sanity check: the unroll count must divide RCNT */
+#endif
+ size_t r;
+ u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+ u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
+#endif
+ u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */
+ u64b_t w [WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers); handed to Skein_Show_R_Ptr() */
+ Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3;
+ Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! (the do/while below always runs once) */
+ ts[0] = ctx->h.T[0]; /* ts and ks are not declared in this function -- presumably macros aliasing kw[]; TODO confirm */
+ ts[1] = ctx->h.T[1];
+ do {
+ /* this implementation only supports 2**64 input bytes (no carry out here) */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ctx->X[4];
+ ks[5] = ctx->X[5];
+ ks[6] = ctx->X[6];
+ ks[7] = ctx->X[7];
+ ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; /* extra key word: XOR of all eight plus the parity constant */
+
+ ts[2] = ts[0] ^ ts[1]; /* third tweak word: XOR of the first two */
+
+ Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1];
+ X2 = w[2] + ks[2];
+ X3 = w[3] + ks[3];
+ X4 = w[4] + ks[4];
+ X5 = w[5] + ks[5] + ts[0];
+ X6 = w[6] + ks[6] + ts[1];
+ X7 = w[7] + ks[7];
+
+ blkPtr += SKEIN_512_BLOCK_BYTES; /* advance to the next input block (w[] already holds this one) */
+
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+ /* run the rounds */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* four add / rotate-left / xor mixes; rNum is not used here */ \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+#if SKEIN_UNROLL_512 == 0
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled: rNum is reported as the round number */ \
+ Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
+
+#define I512(R) /* unrolled: key injection number (R)+1; indices wrap via % 9 and % 3 */ \
+ X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \
+ X1 += ks[((R)+2) % 9]; \
+ X2 += ks[((R)+3) % 9]; \
+ X3 += ks[((R)+4) % 9]; \
+ X4 += ks[((R)+5) % 9]; \
+ X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
+ X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
+ X7 += ks[((R)+8) % 9] + (R)+1; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else /* looping version */
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* looping: round number reported is 4*(r-1)+rNum */ \
+ Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
+
+#define I512(R) /* looping: key injection number r+(R); indices grow linearly instead of wrapping */ \
+ X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
+ X1 += ks[r+(R)+1]; \
+ X2 += ks[r+(R)+2]; \
+ X3 += ks[r+(R)+3]; \
+ X4 += ks[r+(R)+4]; \
+ X5 += ks[r+(R)+5] + ts[r+(R)+0]; \
+ X6 += ks[r+(R)+6] + ts[r+(R)+1]; \
+ X7 += ks[r+(R)+7] + r+(R) ; \
+ ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule: append a copy of the word that just dropped out */ \
+ ts[r + (R)+2] = ts[r+(R)-1]; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+ for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it: each pass runs 8*SKEIN_UNROLL_512 rounds */
+#endif /* end of looped code definitions */
+ {
+#define R512_8_rounds(R) /* do 8 full rounds, with a key injection after each group of four */ \
+ R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
+ R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
+ R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
+ R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
+ I512(2*(R)); \
+ R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
+ R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
+ R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
+ R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
+ I512(2*(R)+1); /* and key injection */
+
+ R512_8_rounds( 0);
+
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) /* nonzero if 8-round group NN must be emitted */
+
+ #if R512_Unroll_R( 1)
+ R512_8_rounds( 1);
+ #endif
+ #if R512_Unroll_R( 2)
+ R512_8_rounds( 2);
+ #endif
+ #if R512_Unroll_R( 3)
+ R512_8_rounds( 3);
+ #endif
+ #if R512_Unroll_R( 4)
+ R512_8_rounds( 4);
+ #endif
+ #if R512_Unroll_R( 5)
+ R512_8_rounds( 5);
+ #endif
+ #if R512_Unroll_R( 6)
+ R512_8_rounds( 6);
+ #endif
+ #if R512_Unroll_R( 7)
+ R512_8_rounds( 7);
+ #endif
+ #if R512_Unroll_R( 8)
+ R512_8_rounds( 8);
+ #endif
+ #if R512_Unroll_R( 9)
+ R512_8_rounds( 9);
+ #endif
+ #if R512_Unroll_R(10)
+ R512_8_rounds(10);
+ #endif
+ #if R512_Unroll_R(11)
+ R512_8_rounds(11);
+ #endif
+ #if R512_Unroll_R(12)
+ R512_8_rounds(12);
+ #endif
+ #if R512_Unroll_R(13)
+ R512_8_rounds(13);
+ #endif
+ #if R512_Unroll_R(14)
+ R512_8_rounds(14);
+ #endif
+ #if (SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+ #endif
+ }
+
+ /* do the final "feedforward" xor, update context chaining vars */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+ ctx->X[4] = X4 ^ w[4];
+ ctx->X[5] = X5 ^ w[5];
+ ctx->X[6] = X6 ^ w[6];
+ ctx->X[7] = X7 ^ w[7];
+ Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST; /* clear the FIRST flag once a block has been processed */
+ }
+ while (--blkCnt);
+ ctx->h.T[0] = ts[0]; /* write the updated tweak back to the context */
+ ctx->h.T[1] = ts[1];
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein_512_Process_Block_CodeSize(void)
+ { /* returns the byte distance from Skein_512_Process_Block to this function */
+ return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
+ ((u08b_t *) Skein_512_Process_Block); /* NOTE(review): a code-size estimate only if the two functions are emitted adjacently, in source order -- confirm for your toolchain */
+ }
+uint_t Skein_512_Unroll_Cnt(void)
+ { /* reports the compile-time SKEIN_UNROLL_512 setting */
+ return SKEIN_UNROLL_512;
+ }
+#endif
+#endif
+
+/***************************** Skein1024 ******************************/
+#if !(SKEIN_USE_ASM & 1024)
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+ { /* do it in C; looping is preferred (unrolled is bigger AND slower!) -- NOTE(review): SKEIN_UNROLL_1024 == 0 below still selects the unrolled path */
+ enum
+ {
+ WCNT = SKEIN1024_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop: units digit of SKEIN_LOOP */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else /* no SKEIN_LOOP: 0 selects the fully unrolled (loop-free) code below */
+#define SKEIN_UNROLL_1024 (0)
+#endif
+
+#if (SKEIN_UNROLL_1024 != 0)
+#if (RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024" /* sanity check: the unroll count must divide RCNT */
+#endif
+ size_t r;
+ u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
+#else
+ u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
+#endif
+
+ u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */
+ X08,X09,X10,X11,X12,X13,X14,X15;
+ u64b_t w [WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers); handed to Skein_Show_R_Ptr() */
+ Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03;
+ Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07;
+ Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11;
+ Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! (the do/while below always runs once) */
+ ts[0] = ctx->h.T[0]; /* ts and ks are not declared in this function -- presumably macros aliasing kw[]; TODO confirm */
+ ts[1] = ctx->h.T[1];
+ do {
+ /* this implementation only supports 2**64 input bytes (no carry out here) */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[ 0] = ctx->X[ 0];
+ ks[ 1] = ctx->X[ 1];
+ ks[ 2] = ctx->X[ 2];
+ ks[ 3] = ctx->X[ 3];
+ ks[ 4] = ctx->X[ 4];
+ ks[ 5] = ctx->X[ 5];
+ ks[ 6] = ctx->X[ 6];
+ ks[ 7] = ctx->X[ 7];
+ ks[ 8] = ctx->X[ 8];
+ ks[ 9] = ctx->X[ 9];
+ ks[10] = ctx->X[10];
+ ks[11] = ctx->X[11];
+ ks[12] = ctx->X[12];
+ ks[13] = ctx->X[13];
+ ks[14] = ctx->X[14];
+ ks[15] = ctx->X[15];
+ ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^
+ ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^
+ ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^
+ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; /* extra key word: XOR of all sixteen plus the parity constant */
+
+ ts[2] = ts[0] ^ ts[1]; /* third tweak word: XOR of the first two */
+
+ Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+
+ X00 = w[ 0] + ks[ 0]; /* do the first full key injection */
+ X01 = w[ 1] + ks[ 1];
+ X02 = w[ 2] + ks[ 2];
+ X03 = w[ 3] + ks[ 3];
+ X04 = w[ 4] + ks[ 4];
+ X05 = w[ 5] + ks[ 5];
+ X06 = w[ 6] + ks[ 6];
+ X07 = w[ 7] + ks[ 7];
+ X08 = w[ 8] + ks[ 8];
+ X09 = w[ 9] + ks[ 9];
+ X10 = w[10] + ks[10];
+ X11 = w[11] + ks[11];
+ X12 = w[12] + ks[12];
+ X13 = w[13] + ks[13] + ts[0];
+ X14 = w[14] + ks[14] + ts[1];
+ X15 = w[15] + ks[15];
+
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+
+#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) /* eight add / rotate-left / xor mixes; rNum is not used here */ \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+ X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \
+ X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \
+ X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \
+ X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \
+
+#if SKEIN_UNROLL_1024 == 0
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) /* unrolled: rn is reported as the round number */ \
+ Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
+
+#define I1024(R) /* unrolled: key injection number (R)+1; indices wrap via % 17 and % 3 */ \
+ X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \
+ X01 += ks[((R)+ 2) % 17]; \
+ X02 += ks[((R)+ 3) % 17]; \
+ X03 += ks[((R)+ 4) % 17]; \
+ X04 += ks[((R)+ 5) % 17]; \
+ X05 += ks[((R)+ 6) % 17]; \
+ X06 += ks[((R)+ 7) % 17]; \
+ X07 += ks[((R)+ 8) % 17]; \
+ X08 += ks[((R)+ 9) % 17]; \
+ X09 += ks[((R)+10) % 17]; \
+ X10 += ks[((R)+11) % 17]; \
+ X11 += ks[((R)+12) % 17]; \
+ X12 += ks[((R)+13) % 17]; \
+ X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \
+ X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \
+ X15 += ks[((R)+16) % 17] + (R)+1; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+#else /* looping version */
+#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) /* looping: round number reported is 4*(r-1)+rn */ \
+ Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr);
+
+#define I1024(R) /* looping: key injection number r+(R); indices grow linearly instead of wrapping */ \
+ X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \
+ X01 += ks[r+(R)+ 1]; \
+ X02 += ks[r+(R)+ 2]; \
+ X03 += ks[r+(R)+ 3]; \
+ X04 += ks[r+(R)+ 4]; \
+ X05 += ks[r+(R)+ 5]; \
+ X06 += ks[r+(R)+ 6]; \
+ X07 += ks[r+(R)+ 7]; \
+ X08 += ks[r+(R)+ 8]; \
+ X09 += ks[r+(R)+ 9]; \
+ X10 += ks[r+(R)+10]; \
+ X11 += ks[r+(R)+11]; \
+ X12 += ks[r+(R)+12]; \
+ X13 += ks[r+(R)+13] + ts[r+(R)+0]; \
+ X14 += ks[r+(R)+14] + ts[r+(R)+1]; \
+ X15 += ks[r+(R)+15] + r+(R) ; \
+ ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule: append a copy of the word that just dropped out */ \
+ ts[r + (R)+ 2] = ts[r+(R)-1]; \
+ Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
+
+ for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it (r stays odd, so "<=" here stops at the same point as "<" would) */
+#endif
+ {
+#define R1024_8_rounds(R) /* do 8 full rounds, with a key injection after each group of four */ \
+ R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
+ R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \
+ R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \
+ R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \
+ I1024(2*(R)); \
+ R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \
+ R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \
+ R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \
+ R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \
+ I1024(2*(R)+1);
+
+ R1024_8_rounds( 0);
+
+#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) /* nonzero if 8-round group NN must be emitted */
+
+ #if R1024_Unroll_R( 1)
+ R1024_8_rounds( 1);
+ #endif
+ #if R1024_Unroll_R( 2)
+ R1024_8_rounds( 2);
+ #endif
+ #if R1024_Unroll_R( 3)
+ R1024_8_rounds( 3);
+ #endif
+ #if R1024_Unroll_R( 4)
+ R1024_8_rounds( 4);
+ #endif
+ #if R1024_Unroll_R( 5)
+ R1024_8_rounds( 5);
+ #endif
+ #if R1024_Unroll_R( 6)
+ R1024_8_rounds( 6);
+ #endif
+ #if R1024_Unroll_R( 7)
+ R1024_8_rounds( 7);
+ #endif
+ #if R1024_Unroll_R( 8)
+ R1024_8_rounds( 8);
+ #endif
+ #if R1024_Unroll_R( 9)
+ R1024_8_rounds( 9);
+ #endif
+ #if R1024_Unroll_R(10)
+ R1024_8_rounds(10);
+ #endif
+ #if R1024_Unroll_R(11)
+ R1024_8_rounds(11);
+ #endif
+ #if R1024_Unroll_R(12)
+ R1024_8_rounds(12);
+ #endif
+ #if R1024_Unroll_R(13)
+ R1024_8_rounds(13);
+ #endif
+ #if R1024_Unroll_R(14)
+ R1024_8_rounds(14);
+ #endif
+ #if (SKEIN_UNROLL_1024 > 14)
+#error "need more unrolling in Skein_1024_Process_Block"
+ #endif
+ }
+ /* do the final "feedforward" xor, update context chaining vars */
+
+ ctx->X[ 0] = X00 ^ w[ 0];
+ ctx->X[ 1] = X01 ^ w[ 1];
+ ctx->X[ 2] = X02 ^ w[ 2];
+ ctx->X[ 3] = X03 ^ w[ 3];
+ ctx->X[ 4] = X04 ^ w[ 4];
+ ctx->X[ 5] = X05 ^ w[ 5];
+ ctx->X[ 6] = X06 ^ w[ 6];
+ ctx->X[ 7] = X07 ^ w[ 7];
+ ctx->X[ 8] = X08 ^ w[ 8];
+ ctx->X[ 9] = X09 ^ w[ 9];
+ ctx->X[10] = X10 ^ w[10];
+ ctx->X[11] = X11 ^ w[11];
+ ctx->X[12] = X12 ^ w[12];
+ ctx->X[13] = X13 ^ w[13];
+ ctx->X[14] = X14 ^ w[14];
+ ctx->X[15] = X15 ^ w[15];
+
+ Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST; /* clear the FIRST flag once a block has been processed */
+ blkPtr += SKEIN1024_BLOCK_BYTES; /* advance to the next input block */
+ }
+ while (--blkCnt);
+ ctx->h.T[0] = ts[0]; /* write the updated tweak back to the context */
+ ctx->h.T[1] = ts[1];
+ }
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t Skein1024_Process_Block_CodeSize(void)
+ { /* returns the byte distance from Skein1024_Process_Block to this function */
+ return ((u08b_t *) Skein1024_Process_Block_CodeSize) -
+ ((u08b_t *) Skein1024_Process_Block); /* NOTE(review): a code-size estimate only if the two functions are emitted adjacently, in source order -- confirm for your toolchain */
+ }
+uint_t Skein1024_Unroll_Cnt(void)
+ { /* reports the compile-time SKEIN_UNROLL_1024 setting */
+ return SKEIN_UNROLL_1024;
+ }
+#endif
+#endif
diff --git a/Optimized_32bit/skein_debug.c b/Optimized_32bit/skein_debug.c
new file mode 100644
index 000000000000..fac5038598ea
--- /dev/null
+++ b/Optimized_32bit/skein_debug.c
@@ -0,0 +1,247 @@
+/***********************************************************************
+**
+** Debug output functions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+#include <stdio.h>
+
+#ifdef SKEIN_DEBUG /* only instantiate this code if SKEIN_DEBUG is on */
+#include "skein.h"
+
+static const char INDENT[] = " "; /* how much to indent on new line */
+
+uint_t skein_DebugFlag = 0; /* off by default. Must be set externally */
+
+static void Show64_step(size_t cnt,const u64b_t *X,size_t step)
+ { /* print cnt 64-bit words X[0],X[step],X[2*step],... as "hi32.lo32" hex, four words per output line */
+ size_t i,j;
+ for (i=j=0;i < cnt;i++,j+=step)
+ {
+ if (i % 4 == 0) fputs(INDENT,stdout); /* start of a row; INDENT is data, so it is not used as a printf format */
+ printf(" %08X.%08X ",(uint_32t)(X[j] >> 32),(uint_32t)X[j]);
+ if (i % 4 == 3 || i==cnt-1) printf("\n"); /* end the row after four words or after the last word */
+ fflush(stdout);
+ }
+ }
+
+#define Show64(cnt,X) Show64_step(cnt,X,1)
+
+static void Show64_flag(size_t cnt,const u64b_t *X)
+ { /* like Show64(), except that a set low bit in the pointer value X selects a word stride of 2 */
+ const size_t addr = (size_t) X;
+ if (addr & 1)
+ { /* tagged pointer: strip the tag bit and show every other word */
+ Show64_step(cnt,(const u64b_t *) (addr & ~1),2);
+ return;
+ }
+ Show64_step(cnt,X,1); /* untagged pointer: consecutive words */
+ }
+
+static void Show08(size_t cnt,const u08b_t *b)
+ { /* print cnt bytes from b as two-digit hex, 16 per output line, with a gap after every 4th byte */
+ size_t i;
+ for (i=0;i < cnt;i++)
+ {
+ if (i %16 == 0) fputs(INDENT,stdout); /* start of a row; INDENT is data, so it is not used as a printf format */
+ else if (i % 4 == 0) printf(" ");
+ printf(" %02X",b[i]);
+ if (i %16 == 15 || i==cnt-1) printf("\n"); /* end the row after 16 bytes or after the last byte */
+ fflush(stdout);
+ }
+ }
+
+static const char *AlgoHeader(uint_t bits)
+ { /* returns the label printed at the start of debug output for a 256/512/1024-bit block size */
+ if (skein_DebugFlag & SKEIN_DEBUG_THREEFISH)
+ switch (bits)
+ { /* Threefish naming was requested via the debug flag */
+ case 256: return ":Threefish-256: ";
+ case 512: return ":Threefish-512: ";
+ case 1024: return ":Threefish-1024:";
+ }
+ else
+ switch (bits)
+ { /* default: Skein naming */
+ case 256: return ":Skein-256: ";
+ case 512: return ":Skein-512: ";
+ case 1024: return ":Skein-1024:";
+ }
+ return ":(unknown block size):"; /* was NULL, which every caller passed straight to printf("%s") -- undefined behavior */
+ }
+
+void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr)
+ { /* print the cnt output bytes at outPtr when SKEIN_DEBUG_FINAL is set */
+ if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) && (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+ return; /* config block, and config-block output was not requested */
+ if (!(skein_DebugFlag & SKEIN_DEBUG_FINAL))
+ return; /* final-output display is not enabled */
+ printf("\n%s Final output=\n",AlgoHeader(bits));
+ Show08(cnt,outPtr);
+ printf(" ++++++++++\n");
+ fflush(stdout);
+ }
+
+/* show state after a round (or "pseudo-round"): r >= SKEIN_RND_SPECIAL marks a key injection/feedforward point, otherwise r is a round number; X holds bits/64 state words */
+void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X)
+ {
+ static uint_t injectNum=0; /* not multi-thread safe! counts key injections since the last initial injection */
+
+ if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+ if (skein_DebugFlag)
+ {
+ if (r >= SKEIN_RND_SPECIAL)
+ { /* a key injection (or feedforward) point */
+ injectNum = (r == SKEIN_RND_KEY_INITIAL) ? 0 : injectNum+1;
+ if ( skein_DebugFlag & SKEIN_DEBUG_INJECT ||
+ ((skein_DebugFlag & SKEIN_DEBUG_FINAL) && r == SKEIN_RND_FEED_FWD))
+ {
+ printf("\n%s",AlgoHeader(bits));
+ switch (r)
+ {
+ case SKEIN_RND_KEY_INITIAL:
+ printf(" [state after initial key injection]");
+ break;
+ case SKEIN_RND_KEY_INJECT:
+ printf(" [state after key injection #%02u]",injectNum); /* injectNum is unsigned, so %u rather than %d */
+ break;
+ case SKEIN_RND_FEED_FWD:
+ printf(" [state after plaintext feedforward]");
+ injectNum = 0;
+ break;
+ }
+ printf("=\n");
+ Show64(bits/64,X);
+ if (r== SKEIN_RND_FEED_FWD)
+ printf(" ----------\n");
+ }
+ }
+ else if (skein_DebugFlag & SKEIN_DEBUG_ROUNDS)
+ { /* an ordinary round: show the words, optionally in permuted order */
+ uint_t j;
+ u64b_t p[SKEIN_MAX_STATE_WORDS];
+ const u08b_t *perm;
+ static const u08b_t PERM_256 [4][ 4] = { { 0,1,2,3 }, { 0,3,2,1 }, { 0,1,2,3 }, { 0,3,2,1 } };
+ static const u08b_t PERM_512 [4][ 8] = { { 0,1,2,3,4,5,6,7 },
+ { 2,1,4,7,6,5,0,3 },
+ { 4,1,6,3,0,5,2,7 },
+ { 6,1,0,7,2,5,4,3 }
+ };
+ static const u08b_t PERM_1024[4][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+ { 0, 9, 2,13, 6,11, 4,15,10, 7,12, 3,14, 5, 8, 1 },
+ { 0, 7, 2, 5, 4, 3, 6, 1,12,15,14,13, 8,11,10, 9 },
+ { 0,15, 2,11, 6,13, 4, 9,14, 1, 8, 5,10, 3,12, 7 }
+ };
+
+ if ((skein_DebugFlag & SKEIN_DEBUG_PERMUTE) && (r & 3))
+ { /* permuted view; skipped when r is a multiple of 4 */
+ printf("\n%s [state after round %2d (permuted)]=\n",AlgoHeader(bits),(int)r);
+ switch (bits)
+ {
+ case 256: perm = PERM_256 [r&3]; break;
+ case 512: perm = PERM_512 [r&3]; break;
+ default: perm = PERM_1024[r&3]; break;
+ }
+ for (j=0;j<bits/64;j++)
+ p[j] = X[perm[j]]; /* gather the words in the order given by the table row */
+ Show64(bits/64,p);
+ }
+ else
+ {
+ printf("\n%s [state after round %2d]=\n",AlgoHeader(bits),(int)r);
+ Show64(bits/64,X);
+ }
+ }
+ }
+ }
+
+/* gather the state words referenced by X_ptr[] into one array and display it via Skein_Show_Round() */
+void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[])
+ {
+ u64b_t state[SKEIN_MAX_STATE_WORDS]; /* contiguous copy of the pointed-to words */
+ const uint_t nWords = bits/64;
+ uint_t k = 0;
+ while (k < nWords) /* dereference each pointer in turn */
+ { state[k] = *X_ptr[k]; k++; }
+ Skein_Show_Round(bits,h,r,state);
+ }
+
+
+/* show the state at the start of a block: header, tweak, state/key words, schedules and input, each gated by its own skein_DebugFlag bit */
+void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+ const u64b_t *wPtr, const u64b_t *ksPtr, const u64b_t *tsPtr)
+ {
+ uint_t n;
+ if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG))
+ if (skein_DebugFlag)
+ {
+ if (skein_DebugFlag & SKEIN_DEBUG_HDR)
+ {
+ printf("\n%s Block: outBits=%4u. T0=%06X.",AlgoHeader(bits),(uint_t) h->hashBitLen,(uint_t)h->T[0]); /* both arguments are unsigned, so %u rather than %d */
+ printf(" Type=");
+ n = (uint_t) ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) >> SKEIN_T1_POS_BLK_TYPE);
+ switch (n)
+ {
+ case SKEIN_BLK_TYPE_KEY: printf("KEY. "); break;
+ case SKEIN_BLK_TYPE_CFG: printf("CFG. "); break;
+ case SKEIN_BLK_TYPE_PERS: printf("PERS."); break;
+ case SKEIN_BLK_TYPE_PK : printf("PK. "); break;
+ case SKEIN_BLK_TYPE_KDF: printf("KDF. "); break;
+ case SKEIN_BLK_TYPE_MSG: printf("MSG. "); break;
+ case SKEIN_BLK_TYPE_OUT: printf("OUT. "); break;
+ default: printf("0x%02X.",n); break;
+ }
+ printf(" Flags=");
+ fputs((h->T[1] & SKEIN_T1_FLAG_FIRST) ? " First":" ",stdout); /* fputs: the selected text is data, not a printf format */
+ fputs((h->T[1] & SKEIN_T1_FLAG_FINAL) ? " Final":" ",stdout);
+ fputs((h->T[1] & SKEIN_T1_FLAG_BIT_PAD) ? " Pad" :" ",stdout);
+ n = (uint_t) ((h->T[1] & SKEIN_T1_TREE_LVL_MASK) >> SKEIN_T1_POS_TREE_LVL);
+ if (n)
+ printf(" TreeLevel = %02X",n);
+ printf("\n");
+ fflush(stdout);
+ }
+ if (skein_DebugFlag & SKEIN_DEBUG_TWEAK)
+ {
+ printf(" Tweak:\n");
+ Show64(2,h->T);
+ }
+ if (skein_DebugFlag & SKEIN_DEBUG_STATE)
+ {
+ printf(" %s words:\n",(skein_DebugFlag & SKEIN_DEBUG_THREEFISH)?"Key":"State");
+ Show64(bits/64,X);
+ }
+ if (skein_DebugFlag & SKEIN_DEBUG_KEYSCHED)
+ { /* Show64_flag: a set low bit in tsPtr/ksPtr selects a word stride of 2 */
+ printf(" Tweak schedule:\n");
+ Show64_flag(3,tsPtr);
+ printf(" Key schedule:\n");
+ Show64_flag((bits/64)+1,ksPtr);
+ }
+ if (skein_DebugFlag & SKEIN_DEBUG_INPUT_64)
+ {
+ printf(" Input block (words):\n");
+ Show64(bits/64,wPtr);
+ }
+ if (skein_DebugFlag & SKEIN_DEBUG_INPUT_08)
+ {
+ printf(" Input block (bytes):\n");
+ Show08(bits/8,blkPtr);
+ }
+ }
+ }
+
+void Skein_Show_Key(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes)
+ { /* print the keyBytes bytes at key when SKEIN_DEBUG_KEY is set */
+ if (keyBytes == 0) return; /* an empty key is never shown */
+ if (!(skein_DebugFlag & SKEIN_DEBUG_CONFIG) && (h->T[1] & SKEIN_T1_BLK_TYPE_MASK) == SKEIN_T1_BLK_TYPE_CFG)
+ return; /* config block, and config-block output was not requested */
+ if (!(skein_DebugFlag & SKEIN_DEBUG_KEY))
+ return; /* key display is not enabled */
+ printf("\n%s MAC key = %4u bytes\n",AlgoHeader(bits),(unsigned) keyBytes);
+ Show08(keyBytes,key);
+ }
+#endif
diff --git a/Optimized_32bit/skein_debug.h b/Optimized_32bit/skein_debug.h
new file mode 100644
index 000000000000..7775c0165c0a
--- /dev/null
+++ b/Optimized_32bit/skein_debug.h
@@ -0,0 +1,48 @@
+#ifndef _SKEIN_DEBUG_H_
+#define _SKEIN_DEBUG_H_
+/***********************************************************************
+**
+** Interface definitions for Skein hashing debug output.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#ifdef SKEIN_DEBUG
+/* callout functions used inside Skein code (declared only when SKEIN_DEBUG is defined) */
+void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr,
+ const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr);
+void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X);
+void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]);
+void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr);
+void Skein_Show_Key (uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes);
+
+extern uint_t skein_DebugFlag; /* flags to control debug output (0 --> none) */
+
+#define SKEIN_RND_SPECIAL (1000u) /* "round" values >= this mark key injection/feedforward points */
+#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL+0u) /* state after the initial key injection */
+#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL+1u) /* state after a later key injection */
+#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL+2u) /* state after the plaintext feedforward */
+
+/* flag bits: skein_DebugFlag */
+#define SKEIN_DEBUG_KEY (1u << 1) /* show MAC key */
+#define SKEIN_DEBUG_CONFIG (1u << 2) /* show config block processing */
+#define SKEIN_DEBUG_STATE (1u << 3) /* show input state during Show_Block() */
+#define SKEIN_DEBUG_TWEAK (1u << 4) /* show input tweak during Show_Block() */
+#define SKEIN_DEBUG_KEYSCHED (1u << 5) /* show expanded key schedule */
+#define SKEIN_DEBUG_INPUT_64 (1u << 6) /* show input block as 64-bit words */
+#define SKEIN_DEBUG_INPUT_08 (1u << 7) /* show input block as 8-bit bytes */
+#define SKEIN_DEBUG_INJECT (1u << 8) /* show state after key injection & feedforward points */
+#define SKEIN_DEBUG_ROUNDS (1u << 9) /* show state after all rounds */
+#define SKEIN_DEBUG_FINAL (1u <<10) /* show final output of Skein */
+#define SKEIN_DEBUG_HDR (1u <<11) /* show block header */
+#define SKEIN_DEBUG_THREEFISH (1u <<12) /* use Threefish name instead of Skein */
+#define SKEIN_DEBUG_PERMUTE (1u <<13) /* use word permutations when showing rounds (takes effect only with SKEIN_DEBUG_ROUNDS) */
+#define SKEIN_DEBUG_ALL ((~0u) & ~(SKEIN_DEBUG_THREEFISH | SKEIN_DEBUG_PERMUTE)) /* every bit except THREEFISH and PERMUTE */
+#define THREEFISH_DEBUG_ALL (SKEIN_DEBUG_ALL | SKEIN_DEBUG_THREEFISH) /* same, with Threefish naming */
+
+#endif /* SKEIN_DEBUG */
+
+#endif /* _SKEIN_DEBUG_H_ */
diff --git a/Optimized_32bit/skein_iv.h b/Optimized_32bit/skein_iv.h
new file mode 100644
index 000000000000..a8f54a41d345
--- /dev/null
+++ b/Optimized_32bit/skein_iv.h
@@ -0,0 +1,199 @@
+#ifndef _SKEIN_IV_H_
+#define _SKEIN_IV_H_
+
+#include "skein.h" /* get Skein macros and types */
+
+/*
+***************** Pre-computed Skein IVs *******************
+**
+** NOTE: these values are not "magic" constants, but
+** are generated using the Threefish block function.
+** They are pre-computed here only for speed; i.e., to
+** avoid the need for a Threefish call during Init().
+** The IV for any fixed hash length may be pre-computed.
+** Only the most common values are included here.
+**
+** NOTE(review): the tables are defined (not just declared)
+** here, so include this header from one .c file only.
+**/
+
+#define MK_64 SKEIN_MK_64 /* shorthand used by the tables below; each entry is written as two 32-bit hex halves */
+
+/* blkSize = 256 bits. hashSize = 128 bits (4 x 64-bit words) */
+const u64b_t SKEIN_256_IV_128[] =
+ {
+ MK_64(0xE1111906,0x964D7260),
+ MK_64(0x883DAAA7,0x7C8D811C),
+ MK_64(0x10080DF4,0x91960F7A),
+ MK_64(0xCCF7DDE5,0xB45BC1C2)
+ };
+
+/* blkSize = 256 bits. hashSize = 160 bits (4 x 64-bit words) */
+const u64b_t SKEIN_256_IV_160[] =
+ {
+ MK_64(0x14202314,0x72825E98),
+ MK_64(0x2AC4E9A2,0x5A77E590),
+ MK_64(0xD47A5856,0x8838D63E),
+ MK_64(0x2DD2E496,0x8586AB7D)
+ };
+
+/* blkSize = 256 bits. hashSize = 224 bits (4 x 64-bit words) */
+const u64b_t SKEIN_256_IV_224[] =
+ {
+ MK_64(0xC6098A8C,0x9AE5EA0B),
+ MK_64(0x876D5686,0x08C5191C),
+ MK_64(0x99CB88D7,0xD7F53884),
+ MK_64(0x384BDDB1,0xAEDDB5DE)
+ };
+
+/* blkSize = 256 bits. hashSize = 256 bits (4 x 64-bit words) */
+const u64b_t SKEIN_256_IV_256[] =
+ {
+ MK_64(0xFC9DA860,0xD048B449),
+ MK_64(0x2FCA6647,0x9FA7D833),
+ MK_64(0xB33BC389,0x6656840F),
+ MK_64(0x6A54E920,0xFDE8DA69)
+ };
+
+/* blkSize = 512 bits. hashSize = 128 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_128[] =
+ {
+ MK_64(0xA8BC7BF3,0x6FBF9F52),
+ MK_64(0x1E9872CE,0xBD1AF0AA),
+ MK_64(0x309B1790,0xB32190D3),
+ MK_64(0xBCFBB854,0x3F94805C),
+ MK_64(0x0DA61BCD,0x6E31B11B),
+ MK_64(0x1A18EBEA,0xD46A32E3),
+ MK_64(0xA2CC5B18,0xCE84AA82),
+ MK_64(0x6982AB28,0x9D46982D)
+ };
+
+/* blkSize = 512 bits. hashSize = 160 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_160[] =
+ {
+ MK_64(0x28B81A2A,0xE013BD91),
+ MK_64(0xC2F11668,0xB5BDF78F),
+ MK_64(0x1760D8F3,0xF6A56F12),
+ MK_64(0x4FB74758,0x8239904F),
+ MK_64(0x21EDE07F,0x7EAF5056),
+ MK_64(0xD908922E,0x63ED70B8),
+ MK_64(0xB8EC76FF,0xECCB52FA),
+ MK_64(0x01A47BB8,0xA3F27A6E)
+ };
+
+/* blkSize = 512 bits. hashSize = 224 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_224[] =
+ {
+ MK_64(0xCCD06162,0x48677224),
+ MK_64(0xCBA65CF3,0xA92339EF),
+ MK_64(0x8CCD69D6,0x52FF4B64),
+ MK_64(0x398AED7B,0x3AB890B4),
+ MK_64(0x0F59D1B1,0x457D2BD0),
+ MK_64(0x6776FE65,0x75D4EB3D),
+ MK_64(0x99FBC70E,0x997413E9),
+ MK_64(0x9E2CFCCF,0xE1C41EF7)
+ };
+
+/* blkSize = 512 bits. hashSize = 256 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_256[] =
+ {
+ MK_64(0xCCD044A1,0x2FDB3E13),
+ MK_64(0xE8359030,0x1A79A9EB),
+ MK_64(0x55AEA061,0x4F816E6F),
+ MK_64(0x2A2767A4,0xAE9B94DB),
+ MK_64(0xEC06025E,0x74DD7683),
+ MK_64(0xE7A436CD,0xC4746251),
+ MK_64(0xC36FBAF9,0x393AD185),
+ MK_64(0x3EEDBA18,0x33EDFC13)
+ };
+
+/* blkSize = 512 bits. hashSize = 384 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_384[] =
+ {
+ MK_64(0xA3F6C6BF,0x3A75EF5F),
+ MK_64(0xB0FEF9CC,0xFD84FAA4),
+ MK_64(0x9D77DD66,0x3D770CFE),
+ MK_64(0xD798CBF3,0xB468FDDA),
+ MK_64(0x1BC4A666,0x8A0E4465),
+ MK_64(0x7ED7D434,0xE5807407),
+ MK_64(0x548FC1AC,0xD4EC44D6),
+ MK_64(0x266E1754,0x6AA18FF8)
+ };
+
+/* blkSize = 512 bits. hashSize = 512 bits (8 x 64-bit words) */
+const u64b_t SKEIN_512_IV_512[] =
+ {
+ MK_64(0x4903ADFF,0x749C51CE),
+ MK_64(0x0D95DE39,0x9746DF03),
+ MK_64(0x8FD19341,0x27C79BCE),
+ MK_64(0x9A255629,0xFF352CB1),
+ MK_64(0x5DB62599,0xDF6CA7B0),
+ MK_64(0xEABE394C,0xA9D5C3F4),
+ MK_64(0x991112C7,0x1A75B523),
+ MK_64(0xAE18A40B,0x660FCC33)
+ };
+
+/* blkSize = 1024 bits. hashSize = 384 bits (16 x 64-bit words) */
+const u64b_t SKEIN1024_IV_384[] =
+ {
+ MK_64(0x5102B6B8,0xC1894A35),
+ MK_64(0xFEEBC9E3,0xFE8AF11A),
+ MK_64(0x0C807F06,0xE32BED71),
+ MK_64(0x60C13A52,0xB41A91F6),
+ MK_64(0x9716D35D,0xD4917C38),
+ MK_64(0xE780DF12,0x6FD31D3A),
+ MK_64(0x797846B6,0xC898303A),
+ MK_64(0xB172C2A8,0xB3572A3B),
+ MK_64(0xC9BC8203,0xA6104A6C),
+ MK_64(0x65909338,0xD75624F4),
+ MK_64(0x94BCC568,0x4B3F81A0),
+ MK_64(0x3EBBF51E,0x10ECFD46),
+ MK_64(0x2DF50F0B,0xEEB08542),
+ MK_64(0x3B5A6530,0x0DBC6516),
+ MK_64(0x484B9CD2,0x167BBCE1),
+ MK_64(0x2D136947,0xD4CBAFEA)
+ };
+
+/* blkSize = 1024 bits. hashSize = 512 bits (16 x 64-bit words) */
+const u64b_t SKEIN1024_IV_512[] =
+ {
+ MK_64(0xCAEC0E5D,0x7C1B1B18),
+ MK_64(0xA01B0E04,0x5F03E802),
+ MK_64(0x33840451,0xED912885),
+ MK_64(0x374AFB04,0xEAEC2E1C),
+ MK_64(0xDF25A0E2,0x813581F7),
+ MK_64(0xE4004093,0x8B12F9D2),
+ MK_64(0xA662D539,0xC2ED39B6),
+ MK_64(0xFA8B85CF,0x45D8C75A),
+ MK_64(0x8316ED8E,0x29EDE796),
+ MK_64(0x053289C0,0x2E9F91B8),
+ MK_64(0xC3F8EF1D,0x6D518B73),
+ MK_64(0xBDCEC3C4,0xD5EF332E),
+ MK_64(0x549A7E52,0x22974487),
+ MK_64(0x67070872,0x5B749816),
+ MK_64(0xB9CD28FB,0xF0581BD1),
+ MK_64(0x0E2940B8,0x15804974)
+ };
+
+/* blkSize = 1024 bits. hashSize = 1024 bits (16 x 64-bit words) */
+const u64b_t SKEIN1024_IV_1024[] =
+ {
+ MK_64(0xD593DA07,0x41E72355),
+ MK_64(0x15B5E511,0xAC73E00C),
+ MK_64(0x5180E5AE,0xBAF2C4F0),
+ MK_64(0x03BD41D3,0xFCBCAFAF),
+ MK_64(0x1CAEC6FD,0x1983A898),
+ MK_64(0x6E510B8B,0xCDD0589F),
+ MK_64(0x77E2BDFD,0xC6394ADA),
+ MK_64(0xC11E1DB5,0x24DCB0A3),
+ MK_64(0xD6D14AF9,0xC6329AB5),
+ MK_64(0x6A9B0BFC,0x6EB67E0D),
+ MK_64(0x9243C60D,0xCCFF1332),
+ MK_64(0x1A1F1DDE,0x743F02D4),
+ MK_64(0x0996753C,0x10ED0BB8),
+ MK_64(0x6572DD22,0xF2B4969A),
+ MK_64(0x61FD3062,0xD00A579A),
+ MK_64(0x1DE0536E,0x8682E539)
+ };
+
+#endif /* _SKEIN_IV_H_ */
diff --git a/Optimized_32bit/skein_port.h b/Optimized_32bit/skein_port.h
new file mode 100644
index 000000000000..653302de7467
--- /dev/null
+++ b/Optimized_32bit/skein_port.h
@@ -0,0 +1,124 @@
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+/*******************************************************************
+**
+** Platform-specific definitions for Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Many thanks to Brian Gladman for his portable header files.
+**
+** To port Skein to an "unsupported" platform, change the definitions
+** in this file appropriately.
+**
+********************************************************************/
+
+#include "brg_types.h" /* get integer type definitions */
+
+typedef unsigned int uint_t; /* native unsigned integer (width is whatever the platform's unsigned int is) */
+typedef uint_8t u08b_t; /* 8-bit unsigned integer; uint_8t is presumably supplied by brg_types.h */
+typedef uint_64t u64b_t; /* 64-bit unsigned integer; uint_64t is presumably supplied by brg_types.h */
+
+#ifndef RotL_64 /* an earlier definition of RotL_64 takes precedence over this portable one */
+#define RotL_64(x,N) (((x) << ((N) & 63)) | ((x) >> ((64-(N)) & 63))) /* rotate unsigned 64-bit x left by N (mod 64); both shift counts are masked to 0..63 so N==0 no longer shifts by the full width (undefined in C); x is expanded twice */
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian
+ * Skein_Put64_LSB_First: store 64-bit words as bytes, LSB first
+ * Skein_Get64_LSB_First: load 64-bit words from bytes, LSB first
+ * Skein_Swap64: byte-reverse one 64-bit word when SKEIN_NEED_SWAP
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#include "brg_endian.h" /* get endianness selection */
+#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+ /* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP (1) /* no fast macros are defined on this path, so the portable Put64/Get64 are used */
+#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+ /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP (0)
+#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? (an undefined PLATFORM_MUST_ALIGN also evaluates to 0 here) */
+#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) /* bCnt is a byte count; this header does not include <string.h> itself */
+#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) /* wCnt is a count of 64-bit words, hence 8*wCnt bytes */
+#endif
+#else
+#error "Skein needs endianness setting!"
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ ******************************************************************
+ * Provide any definitions still needed (each one is skipped if a macro of that name already exists).
+ ******************************************************************
+ */
+#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP /* NOTE: the swapping form expands w64 eight times -- avoid arguments with side effects */
+#define Skein_Swap64(w64) \
+ ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \
+ (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \
+ (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \
+ (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \
+ (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \
+ (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \
+ (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \
+ (((((u64b_t)(w64)) >>56) & 0xFF) ) )
+#else
+#define Skein_Swap64(w64) (w64) /* no swap needed: expands to its argument unchanged (no cast to u64b_t on this path) */
+#endif
+#endif /* ifndef Skein_Swap64 */
+
+
+#ifndef Skein_Put64_LSB_First /* only when no macro version was defined earlier */
+void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) /* write bCnt bytes to dst[], taking each src[] word least-significant byte first */
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+ { /* this version is fully portable (big-endian or little-endian), but slow */
+ size_t n; /* byte index into dst[] */
+
+ for (n=0;n<bCnt;n++) /* bCnt is in bytes and need not be a multiple of 8 */
+ dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7))); /* word n/8, shifted so byte (n%8) lands in the low 8 bits */
+ }
+#else
+ ; /* output only the function prototype */
+#endif
+#endif /* ifndef Skein_Put64_LSB_First */
+
+
+#ifndef Skein_Get64_LSB_First /* only when no macro version was defined earlier */
+void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) /* build wCnt 64-bit words in dst[] from 8*wCnt bytes of src[], least-significant byte first */
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+ { /* this version is fully portable (big-endian or little-endian), but slow */
+ size_t n; /* byte offset into src[], advanced one word (8 bytes) at a time */
+
+ for (n=0;n<8*wCnt;n+=8) /* wCnt counts words, so 8*wCnt bytes are consumed */
+ dst[n/8] = (((u64b_t) src[n ]) ) + /* byte 0 -> bits 0..7 */
+ (((u64b_t) src[n+1]) << 8) +
+ (((u64b_t) src[n+2]) << 16) +
+ (((u64b_t) src[n+3]) << 24) +
+ (((u64b_t) src[n+4]) << 32) + /* each byte is widened to u64b_t before it is shifted */
+ (((u64b_t) src[n+5]) << 40) +
+ (((u64b_t) src[n+6]) << 48) +
+ (((u64b_t) src[n+7]) << 56) ; /* byte 7 -> bits 56..63 */
+ }
+#else
+ ; /* output only the function prototype */
+#endif
+#endif /* ifndef Skein_Get64_LSB_First */
+
+#endif /* ifndef _SKEIN_PORT_H_ */
diff --git a/Optimized_64bit/SHA3api_ref.c b/Optimized_64bit/SHA3api_ref.c
new file mode 100644
index 000000000000..6861a3e4bffb
--- /dev/null
+++ b/Optimized_64bit/SHA3api_ref.c
@@ -0,0 +1,115 @@
+/***********************************************************************
+**
+** Implementation of the AHS API using the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <string.h> /* get the memcpy/memset functions */
+#include "skein.h" /* get the Skein API definitions */