diff options
author | Jung-uk Kim <jkim@FreeBSD.org> | 2016-09-22 13:04:03 +0000 |
---|---|---|
committer | Jung-uk Kim <jkim@FreeBSD.org> | 2016-09-22 13:04:03 +0000 |
commit | e1b483878d9824c63d376895da633b0b96fbbaed (patch) | |
tree | 0846e185ed4cc1159a684e408e772c86ae0fc1a7 /crypto/bn | |
parent | 57f1256b1a087adbdf8e5c080dd9ed7975de939a (diff) | |
download | src-e1b483878d9824c63d376895da633b0b96fbbaed.tar.gz src-e1b483878d9824c63d376895da633b0b96fbbaed.zip |
Import OpenSSL 1.0.2i. (tag: vendor/openssl/1.0.2i)
Notes:
svn path=/vendor-crypto/openssl/dist/; revision=306189
svn path=/vendor-crypto/openssl/1.0.2i/; revision=306190; tag=vendor/openssl/1.0.2i
Diffstat (limited to 'crypto/bn')
-rwxr-xr-x | crypto/bn/asm/x86-mont.pl | 41 | ||||
-rw-r--r-- | crypto/bn/asm/x86_64-gcc.c | 2 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont.pl | 185 | ||||
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 227 | ||||
-rw-r--r-- | crypto/bn/bn.h | 2 | ||||
-rw-r--r-- | crypto/bn/bn_div.c | 4 | ||||
-rw-r--r-- | crypto/bn/bn_lib.c | 2 | ||||
-rw-r--r-- | crypto/bn/bn_print.c | 35 | ||||
-rw-r--r-- | crypto/bn/bn_rand.c | 23 | ||||
-rw-r--r-- | crypto/bn/bn_word.c | 22 | ||||
-rw-r--r-- | crypto/bn/bntest.c | 8 |
11 files changed, 341 insertions, 210 deletions
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 89f4de61e896..1c4003efc20a 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -63,27 +63,26 @@ $frame=32; # size of above frame rounded up to 16n &lea ("esi",&wparam(0)); # put aside pointer to argument block &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! &add ("edi",2); # extra two words on top of tp &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); # minimize cache contention by arraning 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. - &mov ("eax","esp"); + &mov ("eax","ebp"); &sub ("eax","edx"); &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 - &xor ("edx","esp"); + &xor ("edx","ebp"); &and ("edx",2048); &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 + &sub ("ebp","edx"); # this splits them apart modulo 4096 - &and ("esp",-64); # align to cache line + &and ("ebp",-64); # align to cache line # Some OSes, *cough*-dows, insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack @@ -91,20 +90,28 @@ $frame=32; # size of above frame rounded up to 16n # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... - &mov ("eax","ebp"); - &sub ("eax","esp"); + &mov ("eax","esp"); + &sub ("eax","ebp"); &and ("eax",-4096); -&set_label("page_walk"); - &mov ("edx",&DWP(0,"esp","eax")); - &sub ("eax",4096); - &data_byte(0x2e); - &jnc (&label("page_walk")); + &mov ("edx","esp"); # saved stack pointer! 
+ &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); ################################# load argument block... &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 #&mov ("edi",&DWP(5*4,"esi"));# int num @@ -112,11 +119,11 @@ $frame=32; # size of above frame rounded up to 16n &mov ($_rp,"eax"); # ... save a copy of argument block &mov ($_ap,"ebx"); &mov ($_bp,"ecx"); - &mov ($_np,"edx"); + &mov ($_np,"ebp"); &mov ($_n0,"esi"); &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! + &mov ($_sp,"edx"); # saved stack pointer! 
if($sse2) { $acc0="mm0"; # mmx register bank layout diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c index d77dc433d405..1729b479d43e 100644 --- a/crypto/bn/asm/x86_64-gcc.c +++ b/crypto/bn/asm/x86_64-gcc.c @@ -194,7 +194,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) BN_ULONG ret, waste; asm("divq %4":"=a"(ret), "=d"(waste) - : "a"(l), "d"(h), "g"(d) + : "a"(l), "d"(h), "r"(d) : "cc"); return ret; diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 8fb6c994e1ef..044fd7ecc0fd 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -97,6 +97,8 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: + mov ${num}d,${num}d + mov %rsp,%rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d @@ -121,29 +123,36 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 2($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) - and \$-1024,%rsp # minimize TLB usage + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # Some OSes, *cough*-dows, insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
- sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 .Lmul_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; @@ -314,13 +323,13 @@ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul_epilogue: ret .size bn_mul_mont,.-bn_mul_mont @@ -332,6 +341,8 @@ $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: + mov ${num}d,${num}d + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -347,23 +358,29 @@ $code.=<<___; push %r14 push %r15 - mov ${num}d,${num}d - lea 4($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul4x_body: - sub %rsp,%r11 + sub %r10,%r11 and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ @@ -742,13 
+759,13 @@ ___ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul4x_epilogue: ret .size bn_mul4x_mont,.-bn_mul4x_mont @@ -778,14 +795,15 @@ $code.=<<___; .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: mov %rsp,%rax +.Lsqr8x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes @@ -798,33 +816,42 @@ bn_sqr8x_mont: # do its job. # lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lsqr8x_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 .Lsqr8x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lsqr8x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: mov $num,%r10 neg $num @@ -948,30 +975,38 @@ $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push 
%r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes - .byte 0x67 xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 - lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) - and \$-128,%rsp - mov %rax,%r11 - sub %rsp,%r11 + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x66,0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## @@ -1332,22 +1367,8 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - jmp .Lcommon_seh_tail + jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent @@ -1375,15 +1396,21 @@ sqr_handler: cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack 
pointer +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -1470,13 +1497,15 @@ $code.=<<___; .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 ___ } diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 938e17081803..f1fbb45b532b 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -86,6 +86,8 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: + mov ${num}d,${num}d + mov %rsp,%rax test \$7,${num}d jnz .Lmul_enter ___ @@ -97,10 +99,7 @@ $code.=<<___; .align 16 .Lmul_enter: - mov ${num}d,${num}d - mov %rsp,%rax movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument - lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 @@ -108,26 +107,36 @@ $code.=<<___; push %r14 push %r15 - lea 2($num),%r11 - neg %r11 - lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) - and \$-1024,%rsp # minimize TLB usage + neg $num + mov %rsp,%r11 + lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lmul_body: # Some OSes, *cough*-dows, insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
- sub %rsp,%rax - and \$-4096,%rax + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + .Lmul_page_walk: - mov (%rsp,%rax),%r11 - sub \$4096,%rax - .byte 0x2e # predict non-taken - jnc .Lmul_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + lea .Linc(%rip),%r10 + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: lea 128($bp),%r12 # reassign $bp (+size optimization) ___ @@ -433,6 +442,8 @@ $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: + .byte 0x67 + mov %rsp,%rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -441,14 +452,13 @@ $code.=<<___ if ($addx); je .Lmulx4x_enter ___ $code.=<<___; - .byte 0x67 - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes @@ -465,32 +475,40 @@ $code.=<<___; # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $rp - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmul4xsp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + .Lmul4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmul4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: neg $num @@ -1034,6 +1052,7 @@ $code.=<<___; .type bn_power5,\@function,6 .align 32 bn_power5: + mov %rsp,%rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d @@ -1042,13 +1061,13 @@ $code.=<<___ if ($addx); je .Lpowerx5_enter ___ $code.=<<___; - mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num @@ -1063,32 +1082,40 @@ $code.=<<___; # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwr_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + .Lpwr_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwr_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: mov $num,%r10 neg $num @@ -2028,6 +2055,7 @@ bn_from_mont8x: push %r13 push %r14 push %r15 +.Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2042,32 +2070,40 @@ bn_from_mont8x: # last operation, we use the opportunity to cleanse it. 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lfrom_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + .Lfrom_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lfrom_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: mov $num,%r10 neg $num @@ -2173,14 +2209,15 @@ $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: -.Lmulx4x_enter: mov %rsp,%rax +.Lmulx4x_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2197,31 +2234,39 @@ bn_mulx4x_mont_gather5: # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmulx4xsp_done: - and \$-64,%rsp # ensure alignment - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + .Lmulx4x_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lmulx4x_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: ############################################################## # Stack layout @@ -2629,14 +2674,15 @@ $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: -.Lpowerx5_enter: mov %rsp,%rax +.Lpowerx5_enter: push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 +.Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2651,32 +2697,40 @@ bn_powerx5: # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwrx_sp_done: - and \$-64,%rsp - mov %rax,%r11 - sub %rsp,%r11 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + .Lpwrx_page_walk: - mov (%rsp,%r11),%r10 - sub \$4096,%r11 - .byte 0x2e # predict non-taken - jnc .Lpwrx_page_walk + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: mov $num,%r10 neg $num @@ -3607,9 +3661,14 @@ mul_handler: cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail @@ -3621,11 +3680,11 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - jmp .Lbody_proceed + jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer -.Lbody_proceed: +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3716,34 +3775,34 @@ $code.=<<___; .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .rva 
.Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] + .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler - .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] + .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] + .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 86264ae6315f..633d1b1f6013 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -842,6 +842,8 @@ int RAND_pseudo_bytes(unsigned char *buf, int num); if (*(ftl--)) break; \ (a)->top = tmp_top; \ } \ + if ((a)->top == 0) \ + (a)->neg = 0; \ bn_pollute(a); \ } diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c index 72e6ce3f74c0..bc37671cf138 100644 --- a/crypto/bn/bn_div.c +++ b/crypto/bn/bn_div.c @@ -155,7 +155,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, ({ asm volatile ( \ "divl %4" \ : "=a"(q), "=d"(rem) \ - : "a"(n1), "d"(n0), "g"(d0) \ + : "a"(n1), "d"(n0), "r"(d0) \ : "cc"); \ q; \ }) @@ -170,7 +170,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, ({ asm volatile ( \ "divq %4" \ : "=a"(q), "=d"(rem) \ - : "a"(n1), "d"(n0), "g"(d0) \ + : "a"(n1), "d"(n0), "r"(d0) \ : "cc"); \ q; \ }) diff --git 
a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c index 80105fff410c..10b78f512607 100644 --- a/crypto/bn/bn_lib.c +++ b/crypto/bn/bn_lib.c @@ -569,7 +569,7 @@ void BN_clear(BIGNUM *a) { bn_check_top(a); if (a->d != NULL) - memset(a->d, 0, a->dmax * sizeof(a->d[0])); + OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0])); a->top = 0; a->neg = 0; } diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c index bfa31efc5621..f121fb6e9a08 100644 --- a/crypto/bn/bn_print.c +++ b/crypto/bn/bn_print.c @@ -72,12 +72,9 @@ char *BN_bn2hex(const BIGNUM *a) char *buf; char *p; - if (a->neg && BN_is_zero(a)) { - /* "-0" == 3 bytes including NULL terminator */ - buf = OPENSSL_malloc(3); - } else { - buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2); - } + if (BN_is_zero(a)) + return OPENSSL_strdup("0"); + buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2); if (buf == NULL) { BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE); goto err; @@ -111,6 +108,7 @@ char *BN_bn2dec(const BIGNUM *a) char *p; BIGNUM *t = NULL; BN_ULONG *bn_data = NULL, *lp; + int bn_data_num; /*- * get an upper bound for the length of the decimal integer @@ -120,9 +118,9 @@ char *BN_bn2dec(const BIGNUM *a) */ i = BN_num_bits(a) * 3; num = (i / 10 + i / 1000 + 1) + 1; - bn_data = - (BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG)); - buf = (char *)OPENSSL_malloc(num + 3); + bn_data_num = num / BN_DEC_NUM + 1; + bn_data = OPENSSL_malloc(bn_data_num * sizeof(BN_ULONG)); + buf = OPENSSL_malloc(num + 3); if ((buf == NULL) || (bn_data == NULL)) { BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE); goto err; @@ -140,9 +138,12 @@ char *BN_bn2dec(const BIGNUM *a) if (BN_is_negative(t)) *p++ = '-'; - i = 0; while (!BN_is_zero(t)) { + if (lp - bn_data >= bn_data_num) + goto err; *lp = BN_div_word(t, BN_DEC_CONV); + if (*lp == (BN_ULONG)-1) + goto err; lp++; } lp--; @@ -240,10 +241,12 @@ int BN_hex2bn(BIGNUM **bn, const char *a) } ret->top = h; bn_correct_top(ret); - ret->neg = neg; *bn = ret; bn_check_top(ret); + /* Don't set 
the negative flag if it's zero. */ + if (ret->top != 0) + ret->neg = neg; return (num); err: if (*bn == NULL) @@ -295,7 +298,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a) if (j == BN_DEC_NUM) j = 0; l = 0; - while (*a) { + while (--i >= 0) { l *= 10; l += *a - '0'; a++; @@ -306,11 +309,13 @@ int BN_dec2bn(BIGNUM **bn, const char *a) j = 0; } } - ret->neg = neg; bn_correct_top(ret); *bn = ret; bn_check_top(ret); + /* Don't set the negative flag if it's zero. */ + if (ret->top != 0) + ret->neg = neg; return (num); err: if (*bn == NULL) @@ -321,6 +326,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a) int BN_asc2bn(BIGNUM **bn, const char *a) { const char *p = a; + if (*p == '-') p++; @@ -331,7 +337,8 @@ int BN_asc2bn(BIGNUM **bn, const char *a) if (!BN_dec2bn(bn, p)) return 0; } - if (*a == '-') + /* Don't set the negative flag if it's zero. */ + if (*a == '-' && (*bn)->top != 0) (*bn)->neg = 1; return 1; } diff --git a/crypto/bn/bn_rand.c b/crypto/bn/bn_rand.c index f9fb2e9e45e0..60d3f2260ba1 100644 --- a/crypto/bn/bn_rand.c +++ b/crypto/bn/bn_rand.c @@ -121,15 +121,14 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom) int ret = 0, bit, bytes, mask; time_t tim; - if (bits < 0 || (bits == 1 && top > 0)) { - BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL); - return 0; - } - if (bits == 0) { + if (top != -1 || bottom != 0) + goto toosmall; BN_zero(rnd); return 1; } + if (bits < 0 || (bits == 1 && top > 0)) + goto toosmall; bytes = (bits + 7) / 8; bit = (bits - 1) % 8; @@ -145,13 +144,9 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom) time(&tim); RAND_add(&tim, sizeof(tim), 0.0); - if (pseudorand) { - if (RAND_pseudo_bytes(buf, bytes) == -1) - goto err; - } else { - if (RAND_bytes(buf, bytes) <= 0) - goto err; - } + /* We ignore the value of pseudorand and always call RAND_bytes */ + if (RAND_bytes(buf, bytes) <= 0) + goto err; #if 1 if (pseudorand == 2) { @@ -199,6 +194,10 @@ static int bnrand(int pseudorand, BIGNUM 
*rnd, int bits, int top, int bottom) } bn_check_top(rnd); return (ret); + +toosmall: + BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL); + return 0; } int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c index b031a60b5bf8..9b5f9cb98c3a 100644 --- a/crypto/bn/bn_word.c +++ b/crypto/bn/bn_word.c @@ -72,10 +72,32 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) if (w == 0) return (BN_ULONG)-1; +#ifndef BN_LLONG + /* + * If |w| is too long and we don't have BN_ULLONG then we need to fall + * back to using BN_div_word + */ + if (w > ((BN_ULONG)1 << BN_BITS4)) { + BIGNUM *tmp = BN_dup(a); + if (tmp == NULL) + return (BN_ULONG)-1; + + ret = BN_div_word(tmp, w); + BN_free(tmp); + + return ret; + } +#endif + bn_check_top(a); w &= BN_MASK2; for (i = a->top - 1; i >= 0; i--) { #ifndef BN_LLONG + /* + * We can assume here that | w <= ((BN_ULONG)1 << BN_BITS4) | and so + * | ret < ((BN_ULONG)1 << BN_BITS4) | and therefore the shifts here are + * safe and will not overflow + */ ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w; ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w; #else diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c index 1e35988022bb..a327b1a647b2 100644 --- a/crypto/bn/bntest.c +++ b/crypto/bn/bntest.c @@ -514,7 +514,7 @@ static void print_word(BIO *bp, BN_ULONG w) int test_div_word(BIO *bp) { BIGNUM a, b; - BN_ULONG r, s; + BN_ULONG r, rmod, s; int i; BN_init(&a); @@ -528,8 +528,14 @@ int test_div_word(BIO *bp) s = b.d[0]; BN_copy(&b, &a); + rmod = BN_mod_word(&b, s); r = BN_div_word(&b, s); + if (rmod != r) { + fprintf(stderr, "Mod (word) test failed!\n"); + return 0; + } + if (bp != NULL) { if (!results) { BN_print(bp, &a); |