src - FreeBSD source tree

diff options


context:
space:
mode:

author	Jung-uk Kim <jkim@FreeBSD.org>	2016-09-22 13:04:03 +0000
committer	Jung-uk Kim <jkim@FreeBSD.org>	2016-09-22 13:04:03 +0000
commit	e1b483878d9824c63d376895da633b0b96fbbaed (patch)
tree	0846e185ed4cc1159a684e408e772c86ae0fc1a7 /crypto/bn
parent	57f1256b1a087adbdf8e5c080dd9ed7975de939a (diff)
download	src-e1b483878d9824c63d376895da633b0b96fbbaed.tar.gz src-e1b483878d9824c63d376895da633b0b96fbbaed.zip

Import OpenSSL 1.0.2i. (tag: vendor/openssl/1.0.2i)

Notes

Notes:
    svn path=/vendor-crypto/openssl/dist/; revision=306189
    svn path=/vendor-crypto/openssl/1.0.2i/; revision=306190; tag=vendor/openssl/1.0.2i

Diffstat (limited to 'crypto/bn')

-rwxr-xr-x

crypto/bn/asm/x86-mont.pl

-rw-r--r--

crypto/bn/asm/x86_64-gcc.c

-rwxr-xr-x

crypto/bn/asm/x86_64-mont.pl

185

-rwxr-xr-x

crypto/bn/asm/x86_64-mont5.pl

227

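-rw-r--r--

crypto/bn/bn.h

-rw-r--r--

crypto/bn/bn_div.c

-rw-r--r--

crypto/bn/bn_lib.c

-rw-r--r--

crypto/bn/bn_print.c

-rw-r--r--

crypto/bn/bn_rand.c

-rw-r--r--

crypto/bn/bn_word.c

-rw-r--r--

crypto/bn/bntest.c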

11 files changed, 341 insertions, 210 deletions

diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index 89f4de61e896..1c4003efc20a 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl

@@ -63,27 +63,26 @@ $frame=32; # size of above frame rounded up to 16n

&lea ("esi",&wparam(0)); # put aside pointer to argument block

&lea ("edx",&wparam(1)); # load ap

- &mov ("ebp","esp"); # saved stack pointer!

&add ("edi",2); # extra two words on top of tp

&neg ("edi");

- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))

+ &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))

&neg ("edi");

# minimize cache contention by arraning 2K window between stack

# pointer and ap argument [np is also position sensitive vector,

# but it's assumed to be near ap, as it's allocated at ~same

# time].

- &mov ("eax","esp");

+ &mov ("eax","ebp");

&sub ("eax","edx");

&and ("eax",2047);

- &sub ("esp","eax"); # this aligns sp and ap modulo 2048

+ &sub ("ebp","eax"); # this aligns sp and ap modulo 2048

- &xor ("edx","esp");

+ &xor ("edx","ebp");

&and ("edx",2048);

&xor ("edx",2048);

- &sub ("esp","edx"); # this splits them apart modulo 4096

+ &sub ("ebp","edx"); # this splits them apart modulo 4096

- &and ("esp",-64); # align to cache line

+ &and ("ebp",-64); # align to cache line

# Some OSes, *cough*-dows, insist on stack being "wired" to

# physical memory in strictly sequential manner, i.e. if stack

@@ -91,20 +90,28 @@ $frame=32; # size of above frame rounded up to 16n

# be punishable by SEGV. But page walking can do good even on

# other OSes, because it guarantees that villain thread hits

# the guard page before it can make damage to innocent one...

- &mov ("eax","ebp");

- &sub ("eax","esp");

+ &mov ("eax","esp");

+ &sub ("eax","ebp");

&and ("eax",-4096);

-&set_label("page_walk");

- &mov ("edx",&DWP(0,"esp","eax"));

- &sub ("eax",4096);

- &data_byte(0x2e);

- &jnc (&label("page_walk"));

+ &mov ("edx","esp"); # saved stack pointer!

+ &lea ("esp",&DWP(0,"ebp","eax"));

+ &mov ("eax",&DWP(0,"esp"));

+ &cmp ("esp","ebp");

+ &ja (&label("page_walk"));

+ &jmp (&label("page_walk_done"));

+&set_label("page_walk",16);

+ &lea ("esp",&DWP(-4096,"esp"));

+ &mov ("eax",&DWP(0,"esp"));

+ &cmp ("esp","ebp");

+ &ja (&label("page_walk"));

+&set_label("page_walk_done");

################################# load argument block...

&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp

&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap

&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp

- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np

+ &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np

&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0

#&mov ("edi",&DWP(5*4,"esi"));# int num

@@ -112,11 +119,11 @@ $frame=32; # size of above frame rounded up to 16n

&mov ($_rp,"eax"); # ... save a copy of argument block

&mov ($_ap,"ebx");

&mov ($_bp,"ecx");

- &mov ($_np,"edx");

+ &mov ($_np,"ebp");

&mov ($_n0,"esi");

&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling

#&mov ($_num,$num); # redundant as $num is not reused

- &mov ($_sp,"ebp"); # saved stack pointer!

+ &mov ($_sp,"edx"); # saved stack pointer!

if($sse2) {

$acc0="mm0"; # mmx register bank layout

diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d77dc433d405..1729b479d43e 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c

@@ -194,7 +194,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)

BN_ULONG ret, waste;

asm("divq %4":"=a"(ret), "=d"(waste)

- : "a"(l), "d"(h), "g"(d)

+ : "a"(l), "d"(h), "r"(d)

: "cc");

return ret;

diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 8fb6c994e1ef..044fd7ecc0fd 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl

@@ -97,6 +97,8 @@ $code=<<___;

.type bn_mul_mont,\@function,6

.align 16

bn_mul_mont:

+ mov ${num}d,${num}d

+ mov %rsp,%rax

test \$3,${num}d

jnz .Lmul_enter

cmp \$8,${num}d

@@ -121,29 +123,36 @@ $code.=<<___;

push %r14

push %r15

- mov ${num}d,${num}d

- lea 2($num),%r10

+ neg $num

mov %rsp,%r11

- neg %r10

- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))

- and \$-1024,%rsp # minimize TLB usage

+ lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))

+ neg $num # restore $num

+ and \$-1024,%r10 # minimize TLB usage

- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp

-.Lmul_body:

# Some OSes, *cough*-dows, insist on stack being "wired" to

# physical memory in strictly sequential manner, i.e. if stack

# allocation spans two pages, then reference to farmost one can

# be punishable by SEGV. But page walking can do good even on

# other OSes, because it guarantees that villain thread hits

# the guard page before it can make damage to innocent one...

- sub %rsp,%r11

+ sub %r10,%r11

and \$-4096,%r11

+ lea (%r10,%r11),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul_page_walk

+ jmp .Lmul_page_walk_done

+.align 16

.Lmul_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x66,0x2e # predict non-taken

- jnc .Lmul_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul_page_walk

+.Lmul_page_walk_done:

+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp

+.Lmul_body:

mov $bp,%r12 # reassign $bp

___

$bp="%r12";

@@ -314,13 +323,13 @@ $code.=<<___;

mov 8(%rsp,$num,8),%rsi # restore %rsp

mov \$1,%rax

- mov (%rsi),%r15

- mov 8(%rsi),%r14

- mov 16(%rsi),%r13

- mov 24(%rsi),%r12

- mov 32(%rsi),%rbp

- mov 40(%rsi),%rbx

- lea 48(%rsi),%rsp

+ mov -48(%rsi),%r15

+ mov -40(%rsi),%r14

+ mov -32(%rsi),%r13

+ mov -24(%rsi),%r12

+ mov -16(%rsi),%rbp

+ mov -8(%rsi),%rbx

+ lea (%rsi),%rsp

.Lmul_epilogue:

ret

.size bn_mul_mont,.-bn_mul_mont

@@ -332,6 +341,8 @@ $code.=<<___;

.type bn_mul4x_mont,\@function,6

.align 16

bn_mul4x_mont:

+ mov ${num}d,${num}d

+ mov %rsp,%rax

.Lmul4x_enter:

___

$code.=<<___ if ($addx);

@@ -347,23 +358,29 @@ $code.=<<___;

push %r14

push %r15

- mov ${num}d,${num}d

- lea 4($num),%r10

+ neg $num

mov %rsp,%r11

- neg %r10

- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))

- and \$-1024,%rsp # minimize TLB usage

+ lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))

+ neg $num # restore

+ and \$-1024,%r10 # minimize TLB usage

- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp

-.Lmul4x_body:

- sub %rsp,%r11

+ sub %r10,%r11

and \$-4096,%r11

+ lea (%r10,%r11),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul4x_page_walk

+ jmp .Lmul4x_page_walk_done

.Lmul4x_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lmul4x_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul4x_page_walk

+.Lmul4x_page_walk_done:

+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp

+.Lmul4x_body:

mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp

mov %rdx,%r12 # reassign $bp

___

@@ -742,13 +759,13 @@ ___

$code.=<<___;

mov 8(%rsp,$num,8),%rsi # restore %rsp

mov \$1,%rax

- mov (%rsi),%r15

- mov 8(%rsi),%r14

- mov 16(%rsi),%r13

- mov 24(%rsi),%r12

- mov 32(%rsi),%rbp

- mov 40(%rsi),%rbx

- lea 48(%rsi),%rsp

+ mov -48(%rsi),%r15

+ mov -40(%rsi),%r14

+ mov -32(%rsi),%r13

+ mov -24(%rsi),%r12

+ mov -16(%rsi),%rbp

+ mov -8(%rsi),%rbx

+ lea (%rsi),%rsp

.Lmul4x_epilogue:

ret

.size bn_mul4x_mont,.-bn_mul4x_mont

@@ -778,14 +795,15 @@ $code.=<<___;

.type bn_sqr8x_mont,\@function,6

.align 32

bn_sqr8x_mont:

-.Lsqr8x_enter:

mov %rsp,%rax

+.Lsqr8x_enter:

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lsqr8x_prologue:

mov ${num}d,%r10d

shl \$3,${num}d # convert $num to bytes

@@ -798,33 +816,42 @@ bn_sqr8x_mont:

# do its job.

lea -64(%rsp,$num,2),%r11

+ mov %rsp,%rbp

mov ($n0),$n0 # *n0

sub $aptr,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lsqr8x_sp_alt

- sub %r11,%rsp # align with $aptr

- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)

+ sub %r11,%rbp # align with $aptr

+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)

jmp .Lsqr8x_sp_done

.align 32

.Lsqr8x_sp_alt:

lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num

- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)

+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lsqr8x_sp_done:

- and \$-64,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lsqr8x_page_walk

+ jmp .Lsqr8x_page_walk_done

+.align 16

.Lsqr8x_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lsqr8x_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lsqr8x_page_walk

+.Lsqr8x_page_walk_done:

mov $num,%r10

neg $num

@@ -948,30 +975,38 @@ $code.=<<___;

.type bn_mulx4x_mont,\@function,6

.align 32

bn_mulx4x_mont:

-.Lmulx4x_enter:

mov %rsp,%rax

+.Lmulx4x_enter:

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lmulx4x_prologue:

shl \$3,${num}d # convert $num to bytes

- .byte 0x67

xor %r10,%r10

sub $num,%r10 # -$num

mov ($n0),$n0 # *n0

- lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)

- and \$-128,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)

+ and \$-128,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmulx4x_page_walk

+ jmp .Lmulx4x_page_walk_done

+.align 16

.Lmulx4x_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x66,0x2e # predict non-taken

- jnc .Lmulx4x_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmulx4x_page_walk

+.Lmulx4x_page_walk_done:

lea ($bp,$num),%r10

##############################################################

@@ -1332,22 +1367,8 @@ mul_handler:

mov 192($context),%r10 # pull $num

mov 8(%rax,%r10,8),%rax # pull saved stack pointer

- lea 48(%rax),%rax

- mov -8(%rax),%rbx

- mov -16(%rax),%rbp

- mov -24(%rax),%r12

- mov -32(%rax),%r13

- mov -40(%rax),%r14

- mov -48(%rax),%r15

- mov %rbx,144($context) # restore context->Rbx

- mov %rbp,160($context) # restore context->Rbp

- mov %r12,216($context) # restore context->R12

- mov %r13,224($context) # restore context->R13

- mov %r14,232($context) # restore context->R14

- mov %r15,240($context) # restore context->R15

- jmp .Lcommon_seh_tail

+ jmp .Lcommon_pop_regs

.size mul_handler,.-mul_handler

.type sqr_handler,\@abi-omnipotent

@@ -1375,15 +1396,21 @@ sqr_handler:

cmp %r10,%rbx # context->Rip<.Lsqr_body

jb .Lcommon_seh_tail

+ mov 4(%r11),%r10d # HandlerData[1]

+ lea (%rsi,%r10),%r10 # body label

+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue

+ jb .Lcommon_pop_regs

mov 152($context),%rax # pull context->Rsp

- mov 4(%r11),%r10d # HandlerData[1]

+ mov 8(%r11),%r10d # HandlerData[2]

lea (%rsi,%r10),%r10 # epilogue label

cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue

jae .Lcommon_seh_tail

mov 40(%rax),%rax # pull saved stack pointer

+.Lcommon_pop_regs:

mov -8(%rax),%rbx

mov -16(%rax),%rbp

mov -24(%rax),%r12

@@ -1470,13 +1497,15 @@ $code.=<<___;

.LSEH_info_bn_sqr8x_mont:

.byte 9,0,0,0

.rva sqr_handler

- .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]

+ .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]

+.align 8

___

$code.=<<___ if ($addx);

.LSEH_info_bn_mulx4x_mont:

.byte 9,0,0,0

.rva sqr_handler

- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]

+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]

+.align 8

___

}

diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 938e17081803..f1fbb45b532b 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl

@@ -86,6 +86,8 @@ $code=<<___;

.type bn_mul_mont_gather5,\@function,6

.align 64

bn_mul_mont_gather5:

+ mov ${num}d,${num}d

+ mov %rsp,%rax

test \$7,${num}d

jnz .Lmul_enter

___

@@ -97,10 +99,7 @@ $code.=<<___;

.align 16

.Lmul_enter:

- mov ${num}d,${num}d

- mov %rsp,%rax

movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument

- lea .Linc(%rip),%r10

push %rbx

push %rbp

push %r12

@@ -108,26 +107,36 @@ $code.=<<___;

push %r14

push %r15

- lea 2($num),%r11

- neg %r11

- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)

- and \$-1024,%rsp # minimize TLB usage

+ neg $num

+ mov %rsp,%r11

+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)

+ neg $num # restore $num

+ and \$-1024,%r10 # minimize TLB usage

- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp

-.Lmul_body:

# Some OSes, *cough*-dows, insist on stack being "wired" to

# physical memory in strictly sequential manner, i.e. if stack

# allocation spans two pages, then reference to farmost one can

# be punishable by SEGV. But page walking can do good even on

# other OSes, because it guarantees that villain thread hits

# the guard page before it can make damage to innocent one...

- sub %rsp,%rax

- and \$-4096,%rax

+ sub %r10,%r11

+ and \$-4096,%r11

+ lea (%r10,%r11),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul_page_walk

+ jmp .Lmul_page_walk_done

.Lmul_page_walk:

- mov (%rsp,%rax),%r11

- sub \$4096,%rax

- .byte 0x2e # predict non-taken

- jnc .Lmul_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r11

+ cmp %r10,%rsp

+ ja .Lmul_page_walk

+.Lmul_page_walk_done:

+ lea .Linc(%rip),%r10

+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp

+.Lmul_body:

lea 128($bp),%r12 # reassign $bp (+size optimization)

___

@@ -433,6 +442,8 @@ $code.=<<___;

.type bn_mul4x_mont_gather5,\@function,6

.align 32

bn_mul4x_mont_gather5:

+ .byte 0x67

+ mov %rsp,%rax

.Lmul4x_enter:

___

$code.=<<___ if ($addx);

@@ -441,14 +452,13 @@ $code.=<<___ if ($addx);

je .Lmulx4x_enter

___

$code.=<<___;

- .byte 0x67

- mov %rsp,%rax

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lmul4x_prologue:

.byte 0x67

shl \$3,${num}d # convert $num to bytes

@@ -465,32 +475,40 @@ $code.=<<___;

# calculated from 7th argument, the index.]

lea -320(%rsp,$num,2),%r11

+ mov %rsp,%rbp

sub $rp,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lmul4xsp_alt

- sub %r11,%rsp # align with $rp

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)

+ sub %r11,%rbp # align with $rp

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)

jmp .Lmul4xsp_done

.align 32

.Lmul4xsp_alt:

lea 4096-320(,$num,2),%r10

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lmul4xsp_done:

- and \$-64,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmul4x_page_walk

+ jmp .Lmul4x_page_walk_done

.Lmul4x_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lmul4x_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmul4x_page_walk

+.Lmul4x_page_walk_done:

neg $num

@@ -1034,6 +1052,7 @@ $code.=<<___;

.type bn_power5,\@function,6

.align 32

bn_power5:

+ mov %rsp,%rax

___

$code.=<<___ if ($addx);

mov OPENSSL_ia32cap_P+8(%rip),%r11d

@@ -1042,13 +1061,13 @@ $code.=<<___ if ($addx);

je .Lpowerx5_enter

___

$code.=<<___;

- mov %rsp,%rax

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lpower5_prologue:

shl \$3,${num}d # convert $num to bytes

lea ($num,$num,2),%r10d # 3*$num

@@ -1063,32 +1082,40 @@ $code.=<<___;

# calculated from 7th argument, the index.]

lea -320(%rsp,$num,2),%r11

+ mov %rsp,%rbp

sub $rptr,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lpwr_sp_alt

- sub %r11,%rsp # align with $aptr

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)

+ sub %r11,%rbp # align with $aptr

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)

jmp .Lpwr_sp_done

.align 32

.Lpwr_sp_alt:

lea 4096-320(,$num,2),%r10

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lpwr_sp_done:

- and \$-64,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lpwr_page_walk

+ jmp .Lpwr_page_walk_done

.Lpwr_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lpwr_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lpwr_page_walk

+.Lpwr_page_walk_done:

mov $num,%r10

neg $num

@@ -2028,6 +2055,7 @@ bn_from_mont8x:

push %r13

push %r14

push %r15

+.Lfrom_prologue:

shl \$3,${num}d # convert $num to bytes

lea ($num,$num,2),%r10 # 3*$num in bytes

@@ -2042,32 +2070,40 @@ bn_from_mont8x:

# last operation, we use the opportunity to cleanse it.

lea -320(%rsp,$num,2),%r11

+ mov %rsp,%rbp

sub $rptr,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lfrom_sp_alt

- sub %r11,%rsp # align with $aptr

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ sub %r11,%rbp # align with $aptr

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)

jmp .Lfrom_sp_done

.align 32

.Lfrom_sp_alt:

lea 4096-320(,$num,2),%r10

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lfrom_sp_done:

- and \$-64,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lfrom_page_walk

+ jmp .Lfrom_page_walk_done

.Lfrom_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lfrom_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lfrom_page_walk

+.Lfrom_page_walk_done:

mov $num,%r10

neg $num

@@ -2173,14 +2209,15 @@ $code.=<<___;

.type bn_mulx4x_mont_gather5,\@function,6

.align 32

bn_mulx4x_mont_gather5:

-.Lmulx4x_enter:

mov %rsp,%rax

+.Lmulx4x_enter:

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lmulx4x_prologue:

shl \$3,${num}d # convert $num to bytes

lea ($num,$num,2),%r10 # 3*$num in bytes

@@ -2197,31 +2234,39 @@ bn_mulx4x_mont_gather5:

# calculated from 7th argument, the index.]

lea -320(%rsp,$num,2),%r11

+ mov %rsp,%rbp

sub $rp,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lmulx4xsp_alt

- sub %r11,%rsp # align with $aptr

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ sub %r11,%rbp # align with $aptr

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)

jmp .Lmulx4xsp_done

.Lmulx4xsp_alt:

lea 4096-320(,$num,2),%r10

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lmulx4xsp_done:

- and \$-64,%rsp # ensure alignment

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp # ensure alignment

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmulx4x_page_walk

+ jmp .Lmulx4x_page_walk_done

.Lmulx4x_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lmulx4x_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lmulx4x_page_walk

+.Lmulx4x_page_walk_done:

##############################################################

# Stack layout

@@ -2629,14 +2674,15 @@ $code.=<<___;

.type bn_powerx5,\@function,6

.align 32

bn_powerx5:

-.Lpowerx5_enter:

mov %rsp,%rax

+.Lpowerx5_enter:

push %rbx

push %rbp

push %r12

push %r13

push %r14

push %r15

+.Lpowerx5_prologue:

shl \$3,${num}d # convert $num to bytes

lea ($num,$num,2),%r10 # 3*$num in bytes

@@ -2651,32 +2697,40 @@ bn_powerx5:

# calculated from 7th argument, the index.]

lea -320(%rsp,$num,2),%r11

+ mov %rsp,%rbp

sub $rptr,%r11

and \$4095,%r11

cmp %r11,%r10

jb .Lpwrx_sp_alt

- sub %r11,%rsp # align with $aptr

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ sub %r11,%rbp # align with $aptr

+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)

jmp .Lpwrx_sp_done

.align 32

.Lpwrx_sp_alt:

lea 4096-320(,$num,2),%r10

- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)

+ lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)

sub %r10,%r11

mov \$0,%r10

cmovc %r10,%r11

- sub %r11,%rsp

+ sub %r11,%rbp

.Lpwrx_sp_done:

- and \$-64,%rsp

- mov %rax,%r11

- sub %rsp,%r11

+ and \$-64,%rbp

+ mov %rsp,%r11

+ sub %rbp,%r11

and \$-4096,%r11

+ lea (%rbp,%r11),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lpwrx_page_walk

+ jmp .Lpwrx_page_walk_done

.Lpwrx_page_walk:

- mov (%rsp,%r11),%r10

- sub \$4096,%r11

- .byte 0x2e # predict non-taken

- jnc .Lpwrx_page_walk

+ lea -4096(%rsp),%rsp

+ mov (%rsp),%r10

+ cmp %rbp,%rsp

+ ja .Lpwrx_page_walk

+.Lpwrx_page_walk_done:

mov $num,%r10

neg $num

@@ -3607,9 +3661,14 @@ mul_handler:

cmp %r10,%rbx # context->Rip<end of prologue label

jb .Lcommon_seh_tail

+ mov 4(%r11),%r10d # HandlerData[1]

+ lea (%rsi,%r10),%r10 # epilogue label

+ cmp %r10,%rbx # context->Rip>=epilogue label

+ jb .Lcommon_pop_regs

mov 152($context),%rax # pull context->Rsp

- mov 4(%r11),%r10d # HandlerData[1]

+ mov 8(%r11),%r10d # HandlerData[2]

lea (%rsi,%r10),%r10 # epilogue label

cmp %r10,%rbx # context->Rip>=epilogue label

jae .Lcommon_seh_tail

@@ -3621,11 +3680,11 @@ mul_handler:

mov 192($context),%r10 # pull $num

mov 8(%rax,%r10,8),%rax # pull saved stack pointer

- jmp .Lbody_proceed

+ jmp .Lcommon_pop_regs

.Lbody_40:

mov 40(%rax),%rax # pull saved stack pointer

-.Lbody_proceed:

+.Lcommon_pop_regs:

mov -8(%rax),%rbx

mov -16(%rax),%rbp

mov -24(%rax),%r12

@@ -3716,34 +3775,34 @@ $code.=<<___;

.LSEH_info_bn_mul_mont_gather5:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]

+ .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]

.align 8

.LSEH_info_bn_mul4x_mont_gather5:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]

+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]

.align 8

.LSEH_info_bn_power5:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]

+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]

.align 8

.LSEH_info_bn_from_mont8x:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]

+ .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]

___

$code.=<<___ if ($addx);

.align 8

.LSEH_info_bn_mulx4x_mont_gather5:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]

+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]

.align 8

.LSEH_info_bn_powerx5:

.byte 9,0,0,0

.rva mul_handler

- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]

+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]

___

$code.=<<___;

.align 8

diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index 86264ae6315f..633d1b1f6013 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h

@@ -842,6 +842,8 @@ int RAND_pseudo_bytes(unsigned char *buf, int num);

if (*(ftl--)) break; \

(a)->top = tmp_top; \

} \

+ if ((a)->top == 0) \

+ (a)->neg = 0; \

bn_pollute(a); \

}

diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 72e6ce3f74c0..bc37671cf138 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c

@@ -155,7 +155,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,

({ asm volatile ( \

"divl %4" \

: "=a"(q), "=d"(rem) \

- : "a"(n1), "d"(n0), "g"(d0) \

+ : "a"(n1), "d"(n0), "r"(d0) \

: "cc"); \

q; \

})

@@ -170,7 +170,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,

({ asm volatile ( \

"divq %4" \

: "=a"(q), "=d"(rem) \

- : "a"(n1), "d"(n0), "g"(d0) \

+ : "a"(n1), "d"(n0), "r"(d0) \

: "cc"); \

q; \

})

diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index 80105fff410c..10b78f512607 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c

@@ -569,7 +569,7 @@ void BN_clear(BIGNUM *a)

{

bn_check_top(a);

if (a->d != NULL)

- memset(a->d, 0, a->dmax * sizeof(a->d[0]));

+ OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));

a->top = 0;

a->neg = 0;

}

diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index bfa31efc5621..f121fb6e9a08 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c

@@ -72,12 +72,9 @@ char *BN_bn2hex(const BIGNUM *a)

char *buf;

char *p;

- if (a->neg && BN_is_zero(a)) {

- /* "-0" == 3 bytes including NULL terminator */

- buf = OPENSSL_malloc(3);

- } else {

- buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);

- }

+ if (BN_is_zero(a))

+ return OPENSSL_strdup("0");

+ buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);

if (buf == NULL) {

BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);

goto err;

@@ -111,6 +108,7 @@ char *BN_bn2dec(const BIGNUM *a)

char *p;

BIGNUM *t = NULL;

BN_ULONG *bn_data = NULL, *lp;

+ int bn_data_num;

/*-

* get an upper bound for the length of the decimal integer

@@ -120,9 +118,9 @@ char *BN_bn2dec(const BIGNUM *a)

i = BN_num_bits(a) * 3;

num = (i / 10 + i / 1000 + 1) + 1;

- bn_data =

- (BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG));

- buf = (char *)OPENSSL_malloc(num + 3);

+ bn_data_num = num / BN_DEC_NUM + 1;

+ bn_data = OPENSSL_malloc(bn_data_num * sizeof(BN_ULONG));

+ buf = OPENSSL_malloc(num + 3);

if ((buf == NULL) || (bn_data == NULL)) {

BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);

goto err;

@@ -140,9 +138,12 @@ char *BN_bn2dec(const BIGNUM *a)

if (BN_is_negative(t))

*p++ = '-';

- i = 0;

while (!BN_is_zero(t)) {

+ if (lp - bn_data >= bn_data_num)

+ goto err;

*lp = BN_div_word(t, BN_DEC_CONV);

+ if (*lp == (BN_ULONG)-1)

+ goto err;

lp++;

}

lp--;

@@ -240,10 +241,12 @@ int BN_hex2bn(BIGNUM **bn, const char *a)

}

ret->top = h;

bn_correct_top(ret);

- ret->neg = neg;

*bn = ret;

bn_check_top(ret);

+ /* Don't set the negative flag if it's zero. */

+ if (ret->top != 0)

+ ret->neg = neg;

return (num);

err:

if (*bn == NULL)

@@ -295,7 +298,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)

if (j == BN_DEC_NUM)

j = 0;

l = 0;

- while (*a) {

+ while (--i >= 0) {

l *= 10;

l += *a - '0';

a++;

@@ -306,11 +309,13 @@ int BN_dec2bn(BIGNUM **bn, const char *a)

j = 0;

}

- ret->neg = neg;

bn_correct_top(ret);

*bn = ret;

bn_check_top(ret);

+ /* Don't set the negative flag if it's zero. */

+ if (ret->top != 0)

+ ret->neg = neg;

return (num);

err:

if (*bn == NULL)

@@ -321,6 +326,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a)

int BN_asc2bn(BIGNUM **bn, const char *a)

{

const char *p = a;

if (*p == '-')

p++;

@@ -331,7 +337,8 @@ int BN_asc2bn(BIGNUM **bn, const char *a)

if (!BN_dec2bn(bn, p))

return 0;

}

- if (*a == '-')

+ /* Don't set the negative flag if it's zero. */

+ if (*a == '-' && (*bn)->top != 0)

(*bn)->neg = 1;

return 1;

}

diff --git a/crypto/bn/bn_rand.c b/crypto/bn/bn_rand.c
index f9fb2e9e45e0..60d3f2260ba1 100644
--- a/crypto/bn/bn_rand.c
+++ b/crypto/bn/bn_rand.c

@@ -121,15 +121,14 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)

int ret = 0, bit, bytes, mask;

time_t tim;

- if (bits < 0 || (bits == 1 && top > 0)) {

- BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);

- return 0;

- }

if (bits == 0) {

+ if (top != -1 || bottom != 0)

+ goto toosmall;

BN_zero(rnd);

return 1;

}

+ if (bits < 0 || (bits == 1 && top > 0))

+ goto toosmall;

bytes = (bits + 7) / 8;

bit = (bits - 1) % 8;

@@ -145,13 +144,9 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)

time(&tim);

RAND_add(&tim, sizeof(tim), 0.0);

- if (pseudorand) {

- if (RAND_pseudo_bytes(buf, bytes) == -1)

- goto err;

- } else {

- if (RAND_bytes(buf, bytes) <= 0)

- goto err;

- }

+ /* We ignore the value of pseudorand and always call RAND_bytes */

+ if (RAND_bytes(buf, bytes) <= 0)

+ goto err;

#if 1

if (pseudorand == 2) {

@@ -199,6 +194,10 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)

}

bn_check_top(rnd);

return (ret);

+toosmall:

+ BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);

+ return 0;

}

int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)

diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index b031a60b5bf8..9b5f9cb98c3a 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c

@@ -72,10 +72,32 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)

if (w == 0)

return (BN_ULONG)-1;

+#ifndef BN_LLONG

+ /*

+ * If |w| is too long and we don't have BN_ULLONG then we need to fall

+ * back to using BN_div_word

+ */

+ if (w > ((BN_ULONG)1 << BN_BITS4)) {

+ BIGNUM *tmp = BN_dup(a);

+ if (tmp == NULL)

+ return (BN_ULONG)-1;

+ ret = BN_div_word(tmp, w);

+ BN_free(tmp);

+ return ret;

+ }

+#endif

bn_check_top(a);

w &= BN_MASK2;

for (i = a->top - 1; i >= 0; i--) {

#ifndef BN_LLONG

+ /*

+ * We can assume here that | w <= ((BN_ULONG)1 << BN_BITS4) | and so

+ * | ret < ((BN_ULONG)1 << BN_BITS4) | and therefore the shifts here are

+ * safe and will not overflow

+ */

ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;

ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;

#else

diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 1e35988022bb..a327b1a647b2 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c

@@ -514,7 +514,7 @@ static void print_word(BIO *bp, BN_ULONG w)

int test_div_word(BIO *bp)

{

BIGNUM a, b;

- BN_ULONG r, s;

+ BN_ULONG r, rmod, s;

int i;

BN_init(&a);

@@ -528,8 +528,14 @@ int test_div_word(BIO *bp)

s = b.d[0];

BN_copy(&b, &a);

+ rmod = BN_mod_word(&b, s);

r = BN_div_word(&b, s);

+ if (rmod != r) {

+ fprintf(stderr, "Mod (word) test failed!\n");

+ return 0;

+ }

if (bp != NULL) {

if (!results) {

BN_print(bp, &a);