diff options
author | Jung-uk Kim <jkim@FreeBSD.org> | 2016-03-01 17:57:01 +0000 |
---|---|---|
committer | Jung-uk Kim <jkim@FreeBSD.org> | 2016-03-01 17:57:01 +0000 |
commit | 9aeed18ad799c20d3accf6e1535817538dc983f6 (patch) | |
tree | 37a4bb1290ee86a2b4ce070f139b2379ee747425 /crypto | |
parent | c188d4cade9cba451816aef2371942bea4ff837f (diff) | |
download | src-9aeed18ad799c20d3accf6e1535817538dc983f6.tar.gz src-9aeed18ad799c20d3accf6e1535817538dc983f6.zip |
Import OpenSSL 1.0.2g.vendor/openssl/1.0.2g
Notes
Notes:
svn path=/vendor-crypto/openssl/dist/; revision=296273
svn path=/vendor-crypto/openssl/1.0.2g/; revision=296274; tag=vendor/openssl/1.0.2g
Diffstat (limited to 'crypto')
40 files changed, 1741 insertions, 1046 deletions
diff --git a/crypto/asn1/tasn_dec.c b/crypto/asn1/tasn_dec.c index 9256049d1588..5a507967c894 100644 --- a/crypto/asn1/tasn_dec.c +++ b/crypto/asn1/tasn_dec.c @@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, long plen; char cst, inf, free_cont = 0; const unsigned char *p; - BUF_MEM buf; + BUF_MEM buf = { 0, NULL, 0 }; const unsigned char *cont = NULL; long len; if (!pval) { @@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, } else { len = p - cont + plen; p += plen; - buf.data = NULL; } } else if (cst) { if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN @@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE); return 0; } - buf.length = 0; - buf.max = 0; - buf.data = NULL; + + /* Free any returned 'buf' content */ + free_cont = 1; /* * Should really check the internal tags are correct but some things * may get this wrong. The relevant specs say that constructed string @@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, * So instead just check for UNIVERSAL class and ignore the tag. */ if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) { - free_cont = 1; goto err; } len = buf.length; /* Append a final null to string */ if (!BUF_MEM_grow_clean(&buf, len + 1)) { ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE); - return 0; + goto err; } buf.data[len] = 0; cont = (const unsigned char *)buf.data; - free_cont = 1; } else { cont = p; len = plen; @@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, } /* We now have content length and type: translate into a structure */ + /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */ if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it)) goto err; diff --git a/crypto/bio/b_print.c b/crypto/bio/b_print.c index 7c81e25d482c..90248fa2aaba 100644 --- a/crypto/bio/b_print.c +++ b/crypto/bio/b_print.c @@ -125,16 +125,16 @@ # define LLONG long #endif -static void fmtstr(char **, char **, size_t *, size_t *, - const char *, int, int, int); -static void fmtint(char **, char **, size_t *, size_t *, - LLONG, int, int, int, int); -static void fmtfp(char **, char **, size_t *, size_t *, - LDOUBLE, int, int, int); -static void doapr_outch(char **, char **, size_t *, size_t *, int); -static void _dopr(char **sbuffer, char **buffer, - size_t *maxlen, size_t *retlen, int *truncated, - const char *format, va_list args); +static int fmtstr(char **, char **, size_t *, size_t *, + const char *, int, int, int); +static int fmtint(char **, char **, size_t *, size_t *, + LLONG, int, int, int, int); +static int fmtfp(char **, char **, size_t *, size_t *, + LDOUBLE, int, int, int); +static int doapr_outch(char **, char **, size_t *, size_t *, int); +static int _dopr(char **sbuffer, char **buffer, + size_t *maxlen, size_t *retlen, int *truncated, + const char *format, va_list args); /* format read states */ #define DP_S_DEFAULT 0 @@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer, #define char_to_int(p) (p - '0') #define OSSL_MAX(p,q) ((p >= q) ? p : q) -static void +static int _dopr(char **sbuffer, char **buffer, size_t *maxlen, @@ -196,7 +196,8 @@ _dopr(char **sbuffer, if (ch == '%') state = DP_S_FLAGS; else - doapr_outch(sbuffer, buffer, &currlen, maxlen, ch); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch)) + return 0; ch = *format++; break; case DP_S_FLAGS: @@ -302,8 +303,9 @@ _dopr(char **sbuffer, value = va_arg(args, int); break; } - fmtint(sbuffer, buffer, &currlen, maxlen, - value, 10, min, max, flags); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min, + max, flags)) + return 0; break; case 'X': flags |= DP_F_UP; @@ -326,17 +328,19 @@ _dopr(char **sbuffer, value = (LLONG) va_arg(args, unsigned int); break; } - fmtint(sbuffer, buffer, &currlen, maxlen, value, - ch == 'o' ? 8 : (ch == 'u' ? 10 : 16), - min, max, flags); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, + ch == 'o' ? 8 : (ch == 'u' ? 10 : 16), + min, max, flags)) + return 0; break; case 'f': if (cflags == DP_C_LDOUBLE) fvalue = va_arg(args, LDOUBLE); else fvalue = va_arg(args, double); - fmtfp(sbuffer, buffer, &currlen, maxlen, - fvalue, min, max, flags); + if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max, + flags)) + return 0; break; case 'E': flags |= DP_F_UP; @@ -355,8 +359,9 @@ _dopr(char **sbuffer, fvalue = va_arg(args, double); break; case 'c': - doapr_outch(sbuffer, buffer, &currlen, maxlen, - va_arg(args, int)); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, + va_arg(args, int))) + return 0; break; case 's': strvalue = va_arg(args, char *); @@ -366,13 +371,15 @@ _dopr(char **sbuffer, else max = *maxlen; } - fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue, - flags, min, max); + if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue, + flags, min, max)) + return 0; break; case 'p': value = (long)va_arg(args, void *); - fmtint(sbuffer, buffer, &currlen, maxlen, - value, 16, min, max, flags | DP_F_NUM); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, + value, 16, min, max, flags | DP_F_NUM)) + return 0; break; case 'n': /* XXX */ if (cflags == DP_C_SHORT) { @@ -394,7 +401,8 @@ _dopr(char **sbuffer, } break; case '%': - doapr_outch(sbuffer, buffer, &currlen, maxlen, ch); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch)) + return 0; break; case 'w': /* not supported yet, treat as next char */ @@ -418,46 +426,56 @@ _dopr(char **sbuffer, *truncated = (currlen > *maxlen - 1); if (*truncated) currlen = *maxlen - 1; - doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0')) + return 0; *retlen = currlen - 1; - return; + return 1; } -static void +static int fmtstr(char **sbuffer, char **buffer, size_t *currlen, size_t *maxlen, const char *value, int flags, int min, int max) { - int padlen, strln; + int padlen; + size_t strln; int cnt = 0; if (value == 0) value = "<NULL>"; - for (strln = 0; value[strln]; ++strln) ; + + strln = strlen(value); + if (strln > INT_MAX) + strln = INT_MAX; + padlen = min - strln; - if (padlen < 0) + if (min < 0 || padlen < 0) padlen = 0; if (flags & DP_F_MINUS) padlen = -padlen; while ((padlen > 0) && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --padlen; ++cnt; } while (*value && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, *value++); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++)) + return 0; ++cnt; } while ((padlen < 0) && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++padlen; ++cnt; } + return 1; } -static void +static int fmtint(char **sbuffer, char **buffer, size_t *currlen, @@ -517,37 +535,44 @@ fmtint(char **sbuffer, /* spaces */ while (spadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --spadlen; } /* sign */ if (signvalue) - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; /* prefix */ while (*prefix) { - doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix)) + return 0; prefix++; } /* zeros */ if (zpadlen > 0) { while (zpadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; --zpadlen; } } /* digits */ - while (place > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]); + while (place > 0) { + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place])) + return 0; + } /* left justified spaces */ while (spadlen < 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++spadlen; } - return; + return 1; } static LDOUBLE abs_val(LDOUBLE value) @@ -578,7 +603,7 @@ static long roundv(LDOUBLE value) return intpart; } -static void +static int fmtfp(char **sbuffer, char **buffer, size_t *currlen, @@ -657,47 +682,61 @@ fmtfp(char **sbuffer, if ((flags & DP_F_ZERO) && (padlen > 0)) { if (signvalue) { - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; --padlen; signvalue = 0; } while (padlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; --padlen; } } while (padlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --padlen; } - if (signvalue) - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; - while (iplace > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]); + while (iplace > 0) { + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace])) + return 0; + } /* * Decimal point. This should probably use locale to find the correct * char to print out. */ if (max > 0 || (flags & DP_F_NUM)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '.'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.')) + return 0; - while (fplace > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]); + while (fplace > 0) { + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, + fconvert[--fplace])) + return 0; + } } while (zpadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; --zpadlen; } while (padlen < 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++padlen; } + return 1; } -static void +#define BUFFER_INC 1024 + +static int doapr_outch(char **sbuffer, char **buffer, size_t *currlen, size_t *maxlen, int c) { @@ -708,24 +747,25 @@ doapr_outch(char **sbuffer, assert(*currlen <= *maxlen); if (buffer && *currlen == *maxlen) { - *maxlen += 1024; + if (*maxlen > INT_MAX - BUFFER_INC) + return 0; + + *maxlen += BUFFER_INC; if (*buffer == NULL) { *buffer = OPENSSL_malloc(*maxlen); - if (!*buffer) { - /* Panic! Can't really do anything sensible. Just return */ - return; - } + if (*buffer == NULL) + return 0; if (*currlen > 0) { assert(*sbuffer != NULL); memcpy(*buffer, *sbuffer, *currlen); } *sbuffer = NULL; } else { - *buffer = OPENSSL_realloc(*buffer, *maxlen); - if (!*buffer) { - /* Panic! Can't really do anything sensible. Just return */ - return; - } + char *tmpbuf; + tmpbuf = OPENSSL_realloc(*buffer, *maxlen); + if (tmpbuf == NULL) + return 0; + *buffer = tmpbuf; } } @@ -736,7 +776,7 @@ doapr_outch(char **sbuffer, (*buffer)[(*currlen)++] = (char)c; } - return; + return 1; } /***************************************************************************/ @@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args) dynbuf = NULL; CRYPTO_push_info("doapr()"); - _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args); + if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, + args)) { + OPENSSL_free(dynbuf); + return -1; + } if (dynbuf) { ret = BIO_write(bio, dynbuf, (int)retlen); OPENSSL_free(dynbuf); @@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args) size_t retlen; int truncated; - _dopr(&buf, NULL, &n, &retlen, &truncated, format, args); + if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args)) + return -1; if (truncated) /* diff --git a/crypto/bio/bio.h b/crypto/bio/bio.h index 6e2293bc66da..6790aed28e0b 100644 --- a/crypto/bio/bio.h +++ b/crypto/bio/bio.h @@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo { # define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0) # define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1) # define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2) -# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL) +# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL) # define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) @@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi, long argl, long ret); BIO_METHOD *BIO_s_mem(void); -BIO *BIO_new_mem_buf(void *buf, int len); +BIO *BIO_new_mem_buf(const void *buf, int len); BIO_METHOD *BIO_s_socket(void); BIO_METHOD *BIO_s_connect(void); BIO_METHOD *BIO_s_accept(void); diff --git a/crypto/bio/bss_mem.c b/crypto/bio/bss_mem.c index d190765dc201..b0394a960da1 100644 --- a/crypto/bio/bss_mem.c +++ b/crypto/bio/bss_mem.c @@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void) return (&mem_method); } -BIO *BIO_new_mem_buf(void *buf, int len) + +BIO *BIO_new_mem_buf(const void *buf, int len) { BIO *ret; BUF_MEM *b; @@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len) if (!(ret = BIO_new(BIO_s_mem()))) return NULL; b = (BUF_MEM *)ret->ptr; - b->data = buf; + /* Cast away const and trust in the MEM_RDONLY flag. */ + b->data = (void *)buf; b->length = sz; b->max = sz; ret->flags |= BIO_FLAGS_MEM_RDONLY; diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 215855ecae91..c4c640951759 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h -bn_exp.o: rsaz_exp.h +bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h +bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl index 3b6ccf83d13e..712a77fe8ca3 100755 --- a/crypto/bn/asm/rsaz-avx2.pl +++ b/crypto/bn/asm/rsaz-avx2.pl @@ -443,7 +443,7 @@ $TEMP2 = $B2; $TEMP3 = $Y1; $TEMP4 = $Y2; $code.=<<___; - #we need to fix indexes 32-39 to avoid overflow + # we need to fix indices 32-39 to avoid overflow vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) @@ -1592,68 +1592,128 @@ rsaz_1024_scatter5_avx2: .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: + vzeroupper + mov %rsp,%r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax - vzeroupper .LSEH_begin_rsaz_1024_gather5: # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) ___ $code.=<<___; - lea .Lgather_table(%rip),%r11 - mov $power,%eax - and \$3,$power - shr \$2,%eax # cache line number - shl \$4,$power # offset within cache line - - vmovdqu -32(%r11),%ymm7 # .Lgather_permd - vpbroadcastb 8(%r11,%rax), %xmm8 - vpbroadcastb 7(%r11,%rax), %xmm9 - vpbroadcastb 6(%r11,%rax), %xmm10 - vpbroadcastb 5(%r11,%rax), %xmm11 - vpbroadcastb 4(%r11,%rax), %xmm12 - vpbroadcastb 3(%r11,%rax), %xmm13 - vpbroadcastb 2(%r11,%rax), %xmm14 - vpbroadcastb 1(%r11,%rax), %xmm15 - - lea 64($inp,$power),$inp - mov \$64,%r11 # size optimization - mov \$9,%eax - jmp .Loop_gather_1024 + lea -0x100(%rsp),%rsp + and \$-32, %rsp + lea .Linc(%rip), %r10 + lea -128(%rsp),%rax # control u-op density + + vmovd $power, %xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5, %ymm0, %ymm2 + vpcmpeqd %ymm4, %ymm0, %ymm0 + vpaddd %ymm5, %ymm1, %ymm3 + vpcmpeqd %ymm4, %ymm1, %ymm1 + vmovdqa %ymm0, 32*0+128(%rax) + vpaddd %ymm5, %ymm2, %ymm0 + vpcmpeqd %ymm4, %ymm2, %ymm2 + vmovdqa %ymm1, 32*1+128(%rax) + vpaddd %ymm5, %ymm3, %ymm1 + vpcmpeqd %ymm4, %ymm3, %ymm3 + vmovdqa %ymm2, 32*2+128(%rax) + vpaddd %ymm5, %ymm0, %ymm2 + vpcmpeqd %ymm4, %ymm0, %ymm0 + vmovdqa %ymm3, 32*3+128(%rax) + vpaddd %ymm5, %ymm1, %ymm3 + vpcmpeqd %ymm4, %ymm1, %ymm1 + vmovdqa %ymm0, 32*4+128(%rax) + vpaddd %ymm5, %ymm2, %ymm8 + vpcmpeqd %ymm4, %ymm2, %ymm2 + vmovdqa %ymm1, 32*5+128(%rax) + vpaddd %ymm5, %ymm3, %ymm9 + vpcmpeqd %ymm4, %ymm3, %ymm3 + vmovdqa %ymm2, 32*6+128(%rax) + vpaddd %ymm5, %ymm8, %ymm10 + vpcmpeqd %ymm4, %ymm8, %ymm8 + vmovdqa %ymm3, 32*7+128(%rax) + vpaddd %ymm5, %ymm9, %ymm11 + vpcmpeqd %ymm4, %ymm9, %ymm9 + vpaddd %ymm5, %ymm10, %ymm12 + vpcmpeqd %ymm4, %ymm10, %ymm10 + vpaddd %ymm5, %ymm11, %ymm13 + vpcmpeqd %ymm4, %ymm11, %ymm11 + vpaddd %ymm5, %ymm12, %ymm14 + vpcmpeqd %ymm4, %ymm12, %ymm12 + vpaddd %ymm5, %ymm13, %ymm15 + vpcmpeqd %ymm4, %ymm13, %ymm13 + vpcmpeqd %ymm4, %ymm14, %ymm14 + vpcmpeqd %ymm4, %ymm15, %ymm15 + + vmovdqa -32(%r10),%ymm7 # .Lgather_permd + lea 128($inp), $inp + mov \$9,$power -.align 32 .Loop_gather_1024: - vpand -64($inp), %xmm8,%xmm0 - vpand ($inp), %xmm9,%xmm1 - vpand 64($inp), %xmm10,%xmm2 - vpand ($inp,%r11,2), %xmm11,%xmm3 - vpor %xmm0,%xmm1,%xmm1 - vpand 64($inp,%r11,2), %xmm12,%xmm4 - vpor %xmm2,%xmm3,%xmm3 - vpand ($inp,%r11,4), %xmm13,%xmm5 - vpor %xmm1,%xmm3,%xmm3 - vpand 64($inp,%r11,4), %xmm14,%xmm6 - vpor %xmm4,%xmm5,%xmm5 - vpand -128($inp,%r11,8), %xmm15,%xmm2 - lea ($inp,%r11,8),$inp - vpor %xmm3,%xmm5,%xmm5 - vpor %xmm2,%xmm6,%xmm6 - vpor %xmm5,%xmm6,%xmm6 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqu %ymm6,($out) + vmovdqa 32*0-128($inp), %ymm0 + vmovdqa 32*1-128($inp), %ymm1 + vmovdqa 32*2-128($inp), %ymm2 + vmovdqa 32*3-128($inp), %ymm3 + vpand 32*0+128(%rax), %ymm0, %ymm0 + vpand 32*1+128(%rax), %ymm1, %ymm1 + vpand 32*2+128(%rax), %ymm2, %ymm2 + vpor %ymm0, %ymm1, %ymm4 + vpand 32*3+128(%rax), %ymm3, %ymm3 + vmovdqa 32*4-128($inp), %ymm0 + vmovdqa 32*5-128($inp), %ymm1 + vpor %ymm2, %ymm3, %ymm5 + vmovdqa 32*6-128($inp), %ymm2 + vmovdqa 32*7-128($inp), %ymm3 + vpand 32*4+128(%rax), %ymm0, %ymm0 + vpand 32*5+128(%rax), %ymm1, %ymm1 + vpand 32*6+128(%rax), %ymm2, %ymm2 + vpor %ymm0, %ymm4, %ymm4 + vpand 32*7+128(%rax), %ymm3, %ymm3 + vpand 32*8-128($inp), %ymm8, %ymm0 + vpor %ymm1, %ymm5, %ymm5 + vpand 32*9-128($inp), %ymm9, %ymm1 + vpor %ymm2, %ymm4, %ymm4 + vpand 32*10-128($inp),%ymm10, %ymm2 + vpor %ymm3, %ymm5, %ymm5 + vpand 32*11-128($inp),%ymm11, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpand 32*12-128($inp),%ymm12, %ymm0 + vpor %ymm1, %ymm5, %ymm5 + vpand 32*13-128($inp),%ymm13, %ymm1 + vpor %ymm2, %ymm4, %ymm4 + vpand 32*14-128($inp),%ymm14, %ymm2 + vpor %ymm3, %ymm5, %ymm5 + vpand 32*15-128($inp),%ymm15, %ymm3 + lea 32*16($inp), $inp + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm4, %ymm4 + vpor %ymm3, %ymm5, %ymm5 + + vpor %ymm5, %ymm4, %ymm4 + vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared + vpor %xmm4, %xmm5, %xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,($out) lea 32($out),$out - dec %eax + dec $power jnz .Loop_gather_1024 vpxor %ymm0,%ymm0,%ymm0 @@ -1661,20 +1721,20 @@ $code.=<<___; vzeroupper ___ $code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - movaps 0x60(%rsp),%xmm12 - movaps 0x70(%rsp),%xmm13 - movaps 0x80(%rsp),%xmm14 - movaps 0x90(%rsp),%xmm15 - lea 0xa8(%rsp),%rsp + movaps -0xa8(%r11),%xmm6 + movaps -0x98(%r11),%xmm7 + movaps -0x88(%r11),%xmm8 + movaps -0x78(%r11),%xmm9 + movaps -0x68(%r11),%xmm10 + movaps -0x58(%r11),%xmm11 + movaps -0x48(%r11),%xmm12 + movaps -0x38(%r11),%xmm13 + movaps -0x28(%r11),%xmm14 + movaps -0x18(%r11),%xmm15 .LSEH_end_rsaz_1024_gather5: ___ $code.=<<___; + lea (%r11),%rsp ret .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ @@ -1708,8 +1768,10 @@ $code.=<<___; .long 0,2,4,6,7,7,7,7 .Lgather_permd: .long 0,7,1,7,2,7,3,7 -.Lgather_table: - .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 +.Linc: + .long 0,0,0,0, 1,1,1,1 + .long 2,2,2,2, 3,3,3,3 + .long 4,4,4,4, 4,4,4,4 .align 64 ___ @@ -1837,18 +1899,19 @@ rsaz_se_handler: .rva rsaz_se_handler .rva .Lmul_1024_body,.Lmul_1024_epilogue .LSEH_info_rsaz_1024_gather5: - .byte 0x01,0x33,0x16,0x00 - .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 - .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 - .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 - .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 - .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 - .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 - .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 - .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 - .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 - .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 - .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 ___ } diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl index 091cdc2069da..87ce2c34d90c 100755 --- a/crypto/bn/asm/rsaz-x86_64.pl +++ b/crypto/bn/asm/rsaz-x86_64.pl @@ -915,9 +915,76 @@ rsaz_512_mul_gather4: push %r14 push %r15 - mov $pwr, $pwr - subq \$128+24, %rsp + subq \$`128+24+($win64?0xb0:0)`, %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0xa0(%rsp) + movaps %xmm7,0xb0(%rsp) + movaps %xmm8,0xc0(%rsp) + movaps %xmm9,0xd0(%rsp) + movaps %xmm10,0xe0(%rsp) + movaps %xmm11,0xf0(%rsp) + movaps %xmm12,0x100(%rsp) + movaps %xmm13,0x110(%rsp) + movaps %xmm14,0x120(%rsp) + movaps %xmm15,0x130(%rsp) +___ +$code.=<<___; .Lmul_gather4_body: + movd $pwr,%xmm8 + movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 + movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 + + pshufd \$0,%xmm8,%xmm8 # broadcast $power + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..15 to $power +# +for($i=0;$i<4;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` + movdqa %xmm7,%xmm`$i+3` +___ +} +for(;$i<7;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` +___ +} +$code.=<<___; + pcmpeqd %xmm8,%xmm7 + + movdqa 16*0($bp),%xmm8 + movdqa 16*1($bp),%xmm9 + movdqa 16*2($bp),%xmm10 + movdqa 16*3($bp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4($bp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5($bp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6($bp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7($bp),%xmm15 + leaq 128($bp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d @@ -926,45 +993,38 @@ $code.=<<___ if ($addx); je .Lmulx_gather ___ $code.=<<___; - movl 64($bp,$pwr,4), %eax - movq $out, %xmm0 # off-load arguments - movl ($bp,$pwr,4), %ebx - movq $mod, %xmm1 - movq $n0, 128(%rsp) + movq %xmm8,%rbx + + movq $n0, 128(%rsp) # off-load arguments + movq $out, 128+8(%rsp) + movq $mod, 128+16(%rsp) - shlq \$32, %rax - or %rax, %rbx movq ($ap), %rax movq 8($ap), %rcx - leaq 128($bp,$pwr,4), %rbp mulq %rbx # 0 iteration movq %rax, (%rsp) movq %rcx, %rax movq %rdx, %r8 mulq %rbx - movd (%rbp), %xmm4 addq %rax, %r8 movq 16($ap), %rax movq %rdx, %r9 adcq \$0, %r9 mulq %rbx - movd 64(%rbp), %xmm5 addq %rax, %r9 movq 24($ap), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx - pslldq \$4, %xmm5 addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx - por %xmm5, %xmm4 addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 @@ -977,14 +1037,12 @@ $code.=<<___; adcq \$0, %r13 mulq %rbx - leaq 128(%rbp), %rbp addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx - movq %xmm4, %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 @@ -996,6 +1054,35 @@ $code.=<<___; .align 32 .Loop_mul_gather: + movdqa 16*0(%rbp),%xmm8 + movdqa 16*1(%rbp),%xmm9 + movdqa 16*2(%rbp),%xmm10 + movdqa 16*3(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7(%rbp),%xmm15 + leaq 128(%rbp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,%rbx + mulq %rbx addq %rax, %r8 movq 8($ap), %rax @@ -1004,7 +1091,6 @@ $code.=<<___; adcq \$0, %r8 mulq %rbx - movd (%rbp), %xmm4 addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx @@ -1013,7 +1099,6 @@ $code.=<<___; adcq \$0, %r9 mulq %rbx - movd 64(%rbp), %xmm5 addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx @@ -1022,7 +1107,6 @@ $code.=<<___; adcq \$0, %r10 mulq %rbx - pslldq \$4, %xmm5 addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx @@ -1031,7 +1115,6 @@ $code.=<<___; adcq \$0, %r11 mulq %rbx - por %xmm5, %xmm4 addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx @@ -1056,7 +1139,6 @@ $code.=<<___; adcq \$0, %r14 mulq %rbx - movq %xmm4, %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx @@ -1064,7 +1146,6 @@ $code.=<<___; movq %rdx, %r15 adcq \$0, %r15 - leaq 128(%rbp), %rbp leaq 8(%rdi), %rdi decl %ecx @@ -1079,8 +1160,8 @@ $code.=<<___; movq %r14, 48(%rdi) movq %r15, 56(%rdi) - movq %xmm0, $out - movq %xmm1, %rbp + movq 128+8(%rsp), $out + movq 128+16(%rsp), %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 @@ -1098,45 +1179,37 @@ $code.=<<___ if ($addx); .align 32 .Lmulx_gather: - mov 64($bp,$pwr,4), %eax - movq $out, %xmm0 # off-load arguments - lea 128($bp,$pwr,4), %rbp - mov ($bp,$pwr,4), %edx - movq $mod, %xmm1 - mov $n0, 128(%rsp) + movq %xmm8,%rdx + + mov $n0, 128(%rsp) # off-load arguments + mov $out, 128+8(%rsp) + mov $mod, 128+16(%rsp) - shl \$32, %rax - or %rax, %rdx mulx ($ap), %rbx, %r8 # 0 iteration mov %rbx, (%rsp) xor %edi, %edi # cf=0, of=0 mulx 8($ap), %rax, %r9 - movd (%rbp), %xmm4 mulx 16($ap), %rbx, %r10 - movd 64(%rbp), %xmm5 adcx %rax, %r8 mulx 24($ap), %rax, %r11 - pslldq \$4, %xmm5 adcx %rbx, %r9 mulx 32($ap), %rbx, %r12 - por %xmm5, %xmm4 adcx %rax, %r10 mulx 40($ap), %rax, %r13 adcx %rbx, %r11 mulx 48($ap), %rbx, %r14 - lea 128(%rbp), %rbp adcx %rax, %r12 mulx 56($ap), %rax, %r15 - movq %xmm4, %rdx adcx %rbx, %r13 adcx %rax, %r14 + .byte 0x67 mov %r8, %rbx adcx %rdi, %r15 # %rdi is 0 @@ -1145,24 +1218,48 @@ $code.=<<___ if ($addx); .align 32 .Loop_mulx_gather: - mulx ($ap), %rax, %r8 + movdqa 16*0(%rbp),%xmm8 + movdqa 16*1(%rbp),%xmm9 + movdqa 16*2(%rbp),%xmm10 + movdqa 16*3(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7(%rbp),%xmm15 + leaq 128(%rbp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,%rdx + + .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 - .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 - movd 64(%rbp), %xmm5 - lea 128(%rbp), %rbp adcx %rax, %r9 adox %r11, %r10 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 - pslldq \$4, %xmm5 - por %xmm5, %xmm4 adcx %rax, %r10 adox %r12, %r11 @@ -1176,10 +1273,10 @@ $code.=<<___ if ($addx); .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 + .byte 0x67 adox %r15, %r14 mulx 56($ap), %rax, %r15 - movq %xmm4, %rdx mov %rbx, 64(%rsp,%rcx,8) adcx %rax, %r14 adox %rdi, %r15 @@ -1198,10 +1295,10 @@ $code.=<<___ if ($addx); mov %r14, 64+48(%rsp) mov %r15, 64+56(%rsp) - movq %xmm0, $out - movq %xmm1, %rbp + mov 128(%rsp), %rdx # pull arguments + mov 128+8(%rsp), $out + mov 128+16(%rsp), %rbp - mov 128(%rsp), %rdx # pull $n0 mov (%rsp), %r8 mov 8(%rsp), %r9 mov 16(%rsp), %r10 @@ -1229,6 +1326,21 @@ $code.=<<___; call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax +___ +$code.=<<___ if ($win64); + movaps 0xa0-0xc8(%rax),%xmm6 + movaps 0xb0-0xc8(%rax),%xmm7 + movaps 0xc0-0xc8(%rax),%xmm8 + movaps 0xd0-0xc8(%rax),%xmm9 + movaps 0xe0-0xc8(%rax),%xmm10 + movaps 0xf0-0xc8(%rax),%xmm11 + movaps 0x100-0xc8(%rax),%xmm12 + movaps 0x110-0xc8(%rax),%xmm13 + movaps 0x120-0xc8(%rax),%xmm14 + movaps 0x130-0xc8(%rax),%xmm15 + lea 0xb0(%rax),%rax +___ +$code.=<<___; movq -48(%rax), %r15 movq -40(%rax), %r14 movq -32(%rax), %r13 @@ -1258,7 +1370,7 @@ rsaz_512_mul_scatter4: mov $pwr, $pwr subq \$128+24, %rsp .Lmul_scatter4_body: - leaq ($tbl,$pwr,4), $tbl + leaq ($tbl,$pwr,8), $tbl movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $tbl, %xmm2 @@ -1329,30 +1441,14 @@ $code.=<<___; call __rsaz_512_subtract - movl %r8d, 64*0($inp) # scatter - shrq \$32, %r8 - movl %r9d, 64*2($inp) - shrq \$32, %r9 - movl %r10d, 64*4($inp) - shrq \$32, %r10 - movl %r11d, 64*6($inp) - shrq \$32, %r11 - movl %r12d, 64*8($inp) - shrq \$32, %r12 - movl %r13d, 64*10($inp) - shrq \$32, %r13 - movl %r14d, 64*12($inp) - shrq \$32, %r14 - movl %r15d, 64*14($inp) - shrq \$32, %r15 - movl %r8d, 64*1($inp) - movl %r9d, 64*3($inp) - movl %r10d, 64*5($inp) - movl %r11d, 64*7($inp) - movl %r12d, 64*9($inp) - movl %r13d, 64*11($inp) - movl %r14d, 64*13($inp) - movl %r15d, 64*15($inp) + movq %r8, 128*0($inp) # scatter + movq %r9, 128*1($inp) + movq %r10, 128*2($inp) + movq %r11, 128*3($inp) + movq %r12, 128*4($inp) + movq %r13, 128*5($inp) + movq %r14, 128*6($inp) + movq %r15, 128*7($inp) leaq 128+24+48(%rsp), %rax movq -48(%rax), %r15 @@ -1956,16 +2052,14 @@ $code.=<<___; .type rsaz_512_scatter4,\@abi-omnipotent .align 16 rsaz_512_scatter4: - leaq ($out,$power,4), $out + leaq ($out,$power,8), $out movl \$8, %r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq ($inp), %rax leaq 8($inp), $inp - movl %eax, ($out) - shrq \$32, %rax - movl %eax, 64($out) + movq %rax, ($out) leaq 128($out), $out decl %r9d jnz .Loop_scatter @@ -1976,22 +2070,106 @@ rsaz_512_scatter4: .type rsaz_512_gather4,\@abi-omnipotent .align 16 rsaz_512_gather4: - leaq ($inp,$power,4), $inp +___ +$code.=<<___ if ($win64); +.LSEH_begin_rsaz_512_gather4: + .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp + .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) + .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) + .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) + .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) + .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) + .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) + .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) + .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) + .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) +___ +$code.=<<___; + movd $power,%xmm8 + movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 + movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 + + pshufd \$0,%xmm8,%xmm8 # broadcast $power + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..15 to $power +# +for($i=0;$i<4;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` + movdqa %xmm7,%xmm`$i+3` +___ +} +for(;$i<7;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` +___ +} +$code.=<<___; + pcmpeqd %xmm8,%xmm7 movl \$8, %r9d jmp .Loop_gather .align 16 .Loop_gather: - movl ($inp), %eax - movl 64($inp), %r8d + movdqa 16*0($inp),%xmm8 + movdqa 16*1($inp),%xmm9 + movdqa 16*2($inp),%xmm10 + movdqa 16*3($inp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4($inp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5($inp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6($inp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7($inp),%xmm15 leaq 128($inp), $inp - shlq \$32, %r8 - or %r8, %rax - movq %rax, ($out) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,($out) leaq 8($out), $out decl %r9d jnz .Loop_gather +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + add \$0xa8,%rsp +___ +$code.=<<___; ret +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 ___ } @@ -2039,6 +2217,18 @@ se_handler: lea 128+24+48(%rax),%rax + lea .Lmul_gather4_epilogue(%rip),%rbx + cmp %r10,%rbx + jne .Lse_not_in_mul_gather4 + + lea 0xb0(%rax),%rax + + lea -48-0xa8(%rax),%rsi + lea 512($context),%rdi + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lse_not_in_mul_gather4: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -2090,7 +2280,7 @@ se_handler: pop %rdi pop %rsi ret -.size sqr_handler,.-sqr_handler +.size se_handler,.-se_handler .section .pdata .align 4 @@ -2114,6 +2304,10 @@ se_handler: .rva .LSEH_end_rsaz_512_mul_by_one .rva .LSEH_info_rsaz_512_mul_by_one + .rva .LSEH_begin_rsaz_512_gather4 + .rva .LSEH_end_rsaz_512_gather4 + .rva .LSEH_info_rsaz_512_gather4 + .section .xdata .align 8 .LSEH_info_rsaz_512_sqr: @@ -2136,6 +2330,19 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] +.LSEH_info_rsaz_512_gather4: + .byte 0x01,0x46,0x16,0x00 + .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8 ___ } diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index e82e451388c7..29ba1224e36b 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -775,100 +775,126 @@ bn_sqr8x_mont: # 4096. this is done to allow memory disambiguation logic # do its job. # - lea -64(%rsp,$num,4),%r11 + lea -64(%rsp,$num,2),%r11 mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rsp .Lsqr8x_sp_done: and \$-64,%rsp - mov $num,%r10 + mov $num,%r10 neg $num - lea 64(%rsp,$num,2),%r11 # copy of modulus mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lsqr8x_body: - mov $num,$i - movq %r11, %xmm2 # save pointer to modulus copy - shr \$3+2,$i - mov OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 8*0($nptr),%xmm0 - movq 8*1($nptr),%xmm1 - movq 8*2($nptr),%xmm3 - movq 8*3($nptr),%xmm4 - lea 8*4($nptr),$nptr - movdqa %xmm0,16*0(%r11) - movdqa %xmm1,16*1(%r11) - movdqa %xmm3,16*2(%r11) - movdqa %xmm4,16*3(%r11) - lea 16*4(%r11),%r11 - dec $i - jnz .Lsqr8x_copy_n - + movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%eax and \$0x80100,%eax cmp \$0x80100,%eax jne .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module - - pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx - shr \$3+2,$num - mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + # %rax top-most carry + # %rbp nptr + # %rcx -8*num + # %r8 end of tp[2*num] + lea (%r8,%rcx),%rbx + mov %rcx,$num + mov %rcx,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %r8 -8*num + # %rdi end of tp[2*num] + lea (%rdi,$num),%rbx + mov $num,%rcx + mov $num,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub +.align 32 +.Lsqr8x_sub: + mov 8*0(%rbx),%r12 + mov 8*1(%rbx),%r13 + mov 8*2(%rbx),%r14 + mov 8*3(%rbx),%r15 + lea 8*4(%rbx),%rbx + sbb 8*0(%rbp),%r12 + sbb 8*1(%rbp),%r13 + sbb 8*2(%rbp),%r14 + sbb 8*3(%rbp),%r15 + lea 8*4(%rbp),%rbp + mov %r12,8*0($rptr) + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + inc %rcx # preserves %cf + jnz .Lsqr8x_sub + + sbb \$0,%rax # top-most carry + lea (%rbx,$num),%rbx # rewind + lea ($rptr,$num),$rptr # rewind + + movq %rax,%xmm1 pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx - shr \$3+2,$num + pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + jmp .Lsqr8x_cond_copy .align 32 -.Lsqr8x_zero: - movdqa %xmm0,16*0(%rax) # wipe t - movdqa %xmm0,16*1(%rax) - movdqa %xmm0,16*2(%rax) - movdqa %xmm0,16*3(%rax) - lea 16*4(%rax),%rax - movdqa %xmm0,16*0(%rdx) # wipe n - movdqa %xmm0,16*1(%rdx) - movdqa %xmm0,16*2(%rdx) - movdqa %xmm0,16*3(%rdx) - lea 16*4(%rdx),%rdx - dec $num - jnz .Lsqr8x_zero +.Lsqr8x_cond_copy: + movdqa 16*0(%rbx),%xmm2 + movdqa 16*1(%rbx),%xmm3 + lea 16*2(%rbx),%rbx + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2(%rbx) # zero tp + movdqa %xmm0,-16*1(%rbx) + movdqa %xmm0,-16*2(%rbx,%rdx) + movdqa %xmm0,-16*1(%rbx,%rdx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + add \$32,$num + jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 @@ -1135,64 +1161,75 @@ $code.=<<___; adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 - mov -8($nptr),$mi sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer - sub %r14,$mi # compare top-most words - sbb $mi,$mi - or $mi,%r15 - - neg $num - xor %rdx,%rdx + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + neg %r15 + mov $num,%rdx + shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + mov 8*0($tptr),%r11 + mov 8*1($tptr),%r12 + mov 8*2($tptr),%r13 + mov 8*3($tptr),%r14 + lea 8*4($tptr),$tptr + sbb 8*0($nptr),%r11 + sbb 8*1($nptr),%r12 + sbb 8*2($nptr),%r13 + sbb 8*3($nptr),%r14 + lea 8*4($nptr),$nptr + mov %r11,8*0($rptr) + mov %r12,8*1($rptr) + mov %r13,8*2($rptr) + mov %r14,8*3($rptr) + lea 8*4($rptr),$rptr + dec $num # preserves %cf + jnz .Lmulx4x_sub + + sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr + sub %rdx,$rptr # rewind + movq %r15,%xmm1 pxor %xmm0,%xmm0 - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - neg %r8 - jmp .Lmulx4x_sub_entry + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp + jmp .Lmulx4x_cond_copy .align 32 -.Lmulx4x_sub: - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - not %r8 -.Lmulx4x_sub_entry: - mov 2*8($nptr,$num),%r10 - not %r9 - and %r15,%r8 - mov 3*8($nptr,$num),%r11 - not %r10 - and %r15,%r9 - not %r11 - and %r15,%r10 - and %r15,%r11 - - neg %rdx # mov %rdx,%cf - adc 0*8($tptr),%r8 - adc 1*8($tptr),%r9 - movdqa %xmm0,($tptr) - adc 2*8($tptr),%r10 - adc 3*8($tptr),%r11 - movdqa %xmm0,16($tptr) - lea 4*8($tptr),$tptr - sbb %rdx,%rdx # mov %cf,%rdx +.Lmulx4x_cond_copy: + movdqa 16*0($tptr),%xmm2 + movdqa 16*1($tptr),%xmm3 + lea 16*2($tptr),$tptr + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2($tptr) # zero tp + movdqa %xmm0,-16*1($tptr) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + sub \$32,%rdx + jnz .Lmulx4x_cond_copy - mov %r8,0*8($rptr) - mov %r9,1*8($rptr) - mov %r10,2*8($rptr) - mov %r11,3*8($rptr) - lea 4*8($rptr),$rptr + mov %rdx,($tptr) - add \$32,$num - jnz .Lmulx4x_sub - - mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax mov -48(%rsi),%r15 mov -40(%rsi),%r14 diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 292409c4ffb8..2e8c9db32cbc 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -99,58 +99,111 @@ $code.=<<___; .Lmul_enter: mov ${num}d,${num}d mov %rsp,%rax - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument + lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; + lea 2($num),%r11 neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) + lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) and \$-1024,%rsp # minimize TLB usage mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: - mov $bp,%r12 # reassign $bp + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 + lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) + and \$-16,%r10 + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($k+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($k+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($k+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($k+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($k+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($k+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($k+2)+112`(%r10) + pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($k+1)-128`($bp),%xmm1 + pand `16*($k+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($k+3)+112`(%r10) + pand `16*($k+3)-128`($bp),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm4 + movdqa `16*($k+1)-128`($bp),%xmm5 + movdqa `16*($k+2)-128`($bp),%xmm2 + pand `16*($k+0)+112`(%r10),%xmm4 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($k+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($k+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value @@ -159,29 +212,14 @@ $code.=<<___; xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -212,16 +250,14 @@ $code.=<<___; mulq $m1 # np[j]*m1 cmp $num,$j - jne .L1st - - movq %xmm0,$m0 # bp[1] + jne .L1st # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 mov $lo0,$hi0 @@ -235,33 +271,48 @@ $code.=<<___; jmp .Louter .align 16 .Louter: + lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) + and \$-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($k=0;$k<$STRIDE/16;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm0 + movdqa `16*($k+1)-128`($bp),%xmm1 + movdqa `16*($k+2)-128`($bp),%xmm2 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+0)-128`(%rdx),%xmm0 + pand `16*($k+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($k+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($k+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + + mov ($ap),%rax # ap[0] + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -295,17 +346,14 @@ $code.=<<___; mulq $m1 # np[j]*m1 cmp $num,$j - jne .Linner - - movq %xmm0,$m0 # bp[i+1] - + jne .Linner # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] - mov (%rsp,$j,8),$lo0 + mov (%rsp,$num,8),$lo0 adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 xor %rdx,%rdx @@ -352,12 +400,7 @@ $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -379,8 +422,8 @@ bn_mul4x_mont_gather5: .Lmul4x_enter: ___ $code.=<<___ if ($addx); - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; @@ -392,39 +435,34 @@ $code.=<<___; push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; + .byte 0x67 - mov ${num}d,%r10d - shl \$3,${num}d - shl \$3+2,%r10d # 4*$num + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $ap - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r11,%rsp # align with $rp + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - lea 4096-64(,$num,2),%r10 - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -440,12 +478,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -460,9 +493,10 @@ $code.=<<___; .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: - shl \$5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument - lea 256(%rdx,$num),%r13 + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; @@ -470,44 +504,92 @@ ___ $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - lea $STRIDE($bp),$tp # borrow $tp - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - .byte 0x67 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tp),%xmm1 - .byte 0x67 - pand %xmm7,%xmm3 - .byte 0x67 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tp),%xmm2 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq `2*$STRIDE/4-96`($tp),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] - movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp @@ -521,26 +603,10 @@ $code.=<<___; mov %rax,$A[0] mov ($np),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq $A[0],$m1 # "tp[0]"*n0 - ############################################################## - # $tp is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8(%rsp,%r11,8),$tp + lea 64+8(%rsp),$tp mov %rdx,$A[1] - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - lea 2*$STRIDE($bp),$bp - por %xmm1,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax @@ -549,7 +615,7 @@ $code.=<<___; mulq $m0 add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -559,7 +625,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] @@ -569,7 +635,7 @@ $code.=<<___; .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -585,7 +651,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -600,7 +666,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] @@ -615,7 +681,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -624,7 +690,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] @@ -634,7 +700,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -650,7 +716,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -663,8 +729,7 @@ $code.=<<___; mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -675,6 +740,33 @@ $code.=<<___; .align 32 .Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] @@ -682,25 +774,11 @@ $code.=<<___; mov ($np),%rax adc \$0,%rdx - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - imulq $A[0],$m1 # tp[0]*n0 - .byte 0x67 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 lea ($tp,$num),$tp # rewind $tp - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded @@ -710,7 +788,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx @@ -722,7 +800,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x @@ -731,7 +809,7 @@ $code.=<<___; .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -749,7 +827,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -766,7 +844,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx @@ -783,7 +861,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx @@ -794,7 +872,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] @@ -804,7 +882,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -823,7 +901,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax - mov -16*1($np),$m1 + mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -838,9 +916,8 @@ $code.=<<___; mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[i+1] mov $N[1],-16($tp) # tp[j-1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -854,16 +931,23 @@ $code.=<<___; ___ if (1) { $code.=<<___; + xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] - xor \$1,$N[1] + sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub - lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx - sar \$3+2,%rcx # cf=0 + sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub - jmp .Lsqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); @@ -930,8 +1014,8 @@ bn_power5: ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; @@ -942,38 +1026,32 @@ $code.=<<___; push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -995,16 +1073,21 @@ $code.=<<___; mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lpower5_body: - movq $rptr,%xmm1 # save $rptr + movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr - movq %r10, %xmm3 # -$num + movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal movq %xmm2,$nptr movq %xmm4,$bptr @@ -1565,9 +1648,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); $code.=<<___; movq %xmm2,$nptr -sqr8x_reduction: +__bn_sqr8x_reduction: xor %rax,%rax - lea ($nptr,$num,2),%rcx # end of n[] + lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window @@ -1593,21 +1676,21 @@ sqr8x_reduction: .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 - mov 16*1($nptr),%rax # n[1] + mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] @@ -1616,7 +1699,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry @@ -1625,7 +1708,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 @@ -1634,7 +1717,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1642,7 +1725,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1650,7 +1733,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1659,7 +1742,7 @@ sqr8x_reduction: mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 @@ -1668,7 +1751,7 @@ sqr8x_reduction: dec %ecx jnz .L8x_reduce - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? @@ -1687,21 +1770,21 @@ sqr8x_reduction: mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx - mov 16*0($nptr),%rax + mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 - mov 16*1($nptr),%rax + mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ @@ -1710,7 +1793,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 @@ -1718,7 +1801,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 @@ -1726,7 +1809,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1734,7 +1817,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1742,7 +1825,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1753,14 +1836,14 @@ sqr8x_reduction: add %rax,%r15 adc \$0,%rdx add %r15,%r14 - mov 16*0($nptr),%rax # pull n[0] + mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop @@ -1806,7 +1889,7 @@ sqr8x_reduction: adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry - mov -16($nptr),%rcx # np[num-1] + mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr @@ -1824,6 +1907,8 @@ sqr8x_reduction: cmp %rdx,$tptr # end of t[]? jb .L8x_reduction_loop + ret +.size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## @@ -1832,48 +1917,62 @@ ___ { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; - #xor %rsi,%rsi # %rsi was $carry above - sub %r15,%rcx # compare top-most words +.type __bn_post4x_internal,\@abi-omnipotent +.align 32 +__bn_post4x_internal: + mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above - adc %rsi,%rsi mov $num,%rcx - or %rsi,%rax movq %xmm1,$rptr # restore $rptr - xor \$1,%rax + neg %rax movq %xmm1,$aptr # prepare for back-to-back call - lea ($nptr,%rax,8),$nptr - sar \$3+2,%rcx # cf=0 - jmp .Lsqr4x_sub + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub -___ -} -$code.=<<___; + mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal ___ +} { $code.=<<___; .globl bn_from_montgomery @@ -1897,39 +1996,32 @@ bn_from_mont8x: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). The stack is allocated to aligned with + # bn_power5's frame, and as bn_from_montgomery happens to be + # last operation, we use the opportunity to cleanse it. # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -1983,12 +2075,13 @@ $code.=<<___; ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr - call sqrx8x_reduction + call __bn_sqrx8x_reduction + call __bn_postx4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax @@ -1999,7 +2092,8 @@ $code.=<<___ if ($addx); .Lfrom_mont_nox: ___ $code.=<<___; - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax @@ -2039,7 +2133,6 @@ $code.=<<___; .align 32 bn_mulx4x_mont_gather5: .Lmulx4x_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2047,40 +2140,33 @@ bn_mulx4x_mont_gather5: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers a[num], ret[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done -.align 32 .Lmulx4xsp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2106,12 +2192,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2126,14 +2207,16 @@ $code.=<<___; .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: - .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num - .byte 0x67 + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 neg $num # restore $num shl \$5,$num - lea 256($bp,$num),%r13 + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num + lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp @@ -2144,52 +2227,92 @@ my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bptr # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bptr),%xmm0 - lea $STRIDE($bptr),$tptr # borrow $tptr - movq `1*$STRIDE/4-96`($bptr),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tptr),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tptr),%xmm2 - por %xmm3,%xmm0 - .byte 0x67,0x67 - pand %xmm4,%xmm1 - movq `2*$STRIDE/4-96`($tptr),%xmm3 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) + lea 128($bp),$bptr # size optimization + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] - movq `3*$STRIDE/4-96`($tptr),%xmm0 - lea 2*$STRIDE($bptr),$bptr # next &b[i] - pand %xmm5,%xmm2 - .byte 0x67,0x67 - pand %xmm6,%xmm3 - ############################################################## - # $tptr is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8*4+8(%rsp,%r11,8),$tptr + lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] @@ -2205,37 +2328,31 @@ $code.=<<___; xor $zero,$zero # cf=0, of=0 mov $mi,%rdx - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 mov $bptr,8+8(%rsp) # off-load &b[i] - por %xmm1,%xmm0 - .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value - .byte 0x66 mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 - .byte 0x67,0x67 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 - .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) - #jmp .Lmulx4x_1st + jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: @@ -2255,30 +2372,29 @@ $code.=<<___; lea 4*8($tptr),$tptr adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[1] adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 @@ -2289,6 +2405,34 @@ $code.=<<___; .align 32 .Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] @@ -2303,54 +2447,37 @@ $code.=<<___; mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 - lea ($nptr,$num,2),$nptr # rewind $nptr + lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 - .byte 0x67 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 - movq `0*$STRIDE/4-96`($bptr),%xmm0 - .byte 0x67,0x67 mov $mi,%rdx - movq `1*$STRIDE/4-96`($bptr),%xmm1 - .byte 0x67 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - .byte 0x67 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - add \$$STRIDE,$bptr # next &b[i] - .byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - por %xmm2,%xmm0 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) - por %xmm3,%xmm0 adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 @@ -2375,20 +2502,20 @@ $code.=<<___; adcx $zero,%r14 # cf=0 adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 @@ -2398,7 +2525,6 @@ $code.=<<___; jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[i+1] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] @@ -2411,20 +2537,26 @@ $code.=<<___; cmp %r10,$bptr jb .Lmulx4x_outer - mov -16($nptr),%r10 + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 - or %r15,$zero - xor \$1,$zero - lea ($tptr,$num),%rdi # rewind $tptr - lea ($nptr,$num,2),$nptr # rewind $nptr - .byte 0x67,0x67 - sar \$3+2,$num # cf=0 - lea ($nptr,$zero,8),%rbp + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp - mov $num,%rcx - jmp .Lsqrx4x_sub # common post-condition + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition .size mulx4x_internal,.-mulx4x_internal ___ }{ @@ -2448,7 +2580,6 @@ $code.=<<___; .align 32 bn_powerx5: .Lpowerx5_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2456,39 +2587,32 @@ bn_powerx5: push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2519,10 +2643,15 @@ $code.=<<___; .Lpowerx5_body: call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr @@ -2534,12 +2663,7 @@ $code.=<<___; mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2973,11 +3097,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr -sqrx8x_reduction: +__bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) - lea -128($nptr,$num,2),%rcx # end of n[] + lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] @@ -3006,23 +3130,23 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_reduce: mov %r8, %rbx - mulx 16*0($nptr),%rax,%r8 # n[0] + mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 - mulx 16*1($nptr),%rbx,%r9 # n[1] + mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rbx,%r10 + mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rbx,%r11 + mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 @@ -3032,15 +3156,15 @@ sqrx8x_reduction: mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 @@ -3056,7 +3180,7 @@ sqrx8x_reduction: mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 @@ -3075,35 +3199,35 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_tail: mov %r8,%rbx - mulx 16*0($nptr),%rax,%r8 + mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 - mulx 16*1($nptr),%rax,%r9 + mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rax,%r10 + mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rax,%r11 + mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 @@ -3119,7 +3243,7 @@ sqrx8x_reduction: sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 @@ -3155,7 +3279,7 @@ sqrx8x_reduction: adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 - mov 16*7($nptr),$carry + mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 @@ -3181,6 +3305,8 @@ sqrx8x_reduction: lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? jb .Lsqrx8x_reduction_loop + ret +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## @@ -3188,52 +3314,59 @@ ___ # { my ($rptr,$nptr)=("%rdx","%rbp"); -my @ri=map("%r$_",(10..13)); -my @ni=map("%r$_",(14..15)); $code.=<<___; - xor %ebx,%ebx - sub %r15,%rsi # compare top-most words - adc %rbx,%rbx +.align 32 +__bn_postx4x_internal: + mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num - or %rbx,%rax mov %rcx,%r9 # -$num - xor \$1,%rax - sar \$3+2,%rcx # cf=0 + neg %rax + sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr - lea ($nptr,%rax,8),$nptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call - jmp .Lsqrx4x_sub + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry -.align 32 +.align 16 .Lsqrx4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub -___ -} -$code.=<<___; + neg %r9 # restore $num ret -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.size __bn_postx4x_internal,.-__bn_postx4x_internal ___ +} }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order @@ -3282,56 +3415,91 @@ bn_scatter5: .globl bn_gather5 .type bn_gather5,\@abi-omnipotent -.align 16 +.align 32 bn_gather5: -___ -$code.=<<___ if ($win64); -.LSEH_begin_bn_gather5: +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 ___ +######################################################################## +# calculate mask by comparing 0..31 to $idx and save result to stack +# +for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; - mov $idx,%r11d - shr \$`log($N/8)/log(2)`,$idx - and \$`$N/8-1`,%r11 - not $idx - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line - movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which - movq 8(%rax,$idx,8),%xmm5 # cache line contains element - movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument - movq 24(%rax,$idx,8),%xmm7 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ +$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather -.align 16 -.Lgather: - movq `0*$STRIDE/4-128`($tbl),%xmm0 - movq `1*$STRIDE/4-128`($tbl),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-128`($tbl),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-128`($tbl),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - .byte 0x67,0x67 - por %xmm2,%xmm0 - lea $STRIDE($tbl),$tbl - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - lea 0x28(%rsp),%rsp -___ -$code.=<<___; + + lea (%r10),%rsp ret .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 @@ -3339,9 +3507,9 @@ ___ } $code.=<<___; .align 64 -.Lmagic_masks: - .long 0,0, 0,0, 0,0, -1,-1 - .long 0,0, 0,0, 0,0, 0,0 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" ___ @@ -3389,19 +3557,16 @@ mul_handler: lea .Lmul_epilogue(%rip),%r10 cmp %r10,%rbx - jb .Lbody_40 + ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer + jmp .Lbody_proceed .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lbody_proceed: - - movaps -88(%rax),%xmm0 - movaps -72(%rax),%xmm1 - mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3414,8 +3579,6 @@ mul_handler: mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 - movups %xmm0,512($context) # restore context->Xmm6 - movups %xmm1,528($context) # restore context->Xmm7 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -3526,10 +3689,9 @@ ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: - .byte 0x01,0x0d,0x05,0x00 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 + .byte 0x01,0x0b,0x03,0x0a + .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 + .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ } diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 5696965e9a09..86264ae6315f 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -125,6 +125,7 @@ #ifndef HEADER_BN_H # define HEADER_BN_H +# include <limits.h> # include <openssl/e_os2.h> # ifndef OPENSSL_NO_FP_API # include <stdio.h> /* FILE */ @@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void); /* library internal functions */ -# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\ - (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2)) +# define bn_expand(a,bits) \ + ( \ + bits > (INT_MAX - BN_BITS2 + 1) ? \ + NULL \ + : \ + (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \ + (a) \ + : \ + bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \ + ) + # define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words))) BIGNUM *bn_expand2(BIGNUM *a, int words); # ifndef OPENSSL_NO_DEPRECATED diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 6d30d1e0fff5..1670f01d1d8c 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -110,6 +110,7 @@ */ #include "cryptlib.h" +#include "constant_time_locl.h" #include "bn_lcl.h" #include <stdlib.h> @@ -606,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos) static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, - int width) + int window) { - size_t i, j; + int i, j; + int width = 1 << window; + BN_ULONG *table = (BN_ULONG *)buf; if (top > b->top) top = b->top; /* this works because 'buf' is explicitly * zeroed */ - for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { - buf[j] = ((unsigned char *)b->d)[i]; + for (i = 0, j = idx; i < top; i++, j += width) { + table[j] = b->d[i]; } return 1; @@ -622,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, - int width) + int window) { - size_t i, j; + int i, j; + int width = 1 << window; + volatile BN_ULONG *table = (volatile BN_ULONG *)buf; if (bn_wexpand(b, top) == NULL) return 0; - for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { - ((unsigned char *)b->d)[i] = buf[j]; + if (window <= 3) { + for (i = 0; i < top; i++, table += width) { + BN_ULONG acc = 0; + + for (j = 0; j < width; j++) { + acc |= table[j] & + ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } + } else { + int xstride = 1 << (window - 2); + BN_ULONG y0, y1, y2, y3; + + i = idx >> (window - 2); /* equivalent of idx / xstride */ + idx &= xstride - 1; /* equivalent of idx % xstride */ + + y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1); + y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1); + y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1); + y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1); + + for (i = 0; i < top; i++, table += width) { + BN_ULONG acc = 0; + + for (j = 0; j < xstride; j++) { + acc |= ( (table[j + 0 * xstride] & y0) | + (table[j + 1 * xstride] & y1) | + (table[j + 2 * xstride] & y2) | + (table[j + 3 * xstride] & y3) ) + & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } } b->top = top; @@ -749,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (window >= 5) { window = 5; /* ~5% improvement for RSA2048 sign, and even * for RSA4096 */ - if ((top & 7) == 0) - powerbufLen += 2 * top * sizeof(m->d[0]); + /* reserve space for mont->N.d[] copy */ + powerbufLen += top * sizeof(mont->N.d[0]); } #endif (void)0; @@ -971,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BN_ULONG *not_used, const BN_ULONG *np, const BN_ULONG *n0, int num); - BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2; + BN_ULONG *n0 = mont->n0, *np; /* * BN_to_montgomery can contaminate words above .top [in @@ -982,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, for (i = tmp.top; i < top; i++) tmp.d[i] = 0; - if (top & 7) - np2 = np; - else - for (np2 = am.d + top, i = 0; i < top; i++) - np2[2 * i] = np[i]; + /* + * copy mont->N.d[] to improve cache locality + */ + for (np = am.d + top, i = 0; i < top; i++) + np[i] = mont->N.d[i]; bn_scatter5(tmp.d, top, powerbuf, 0); bn_scatter5(am.d, am.top, powerbuf, 1); @@ -996,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, # if 0 for (i = 3; i < 32; i++) { /* Calculate a^i = a^(i-1) * a */ - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # else @@ -1007,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } for (i = 3; i < 8; i += 2) { int j; - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); for (j = 2 * i; j < 32; j *= 2) { bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); @@ -1015,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } } for (; i < 16; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_scatter5(tmp.d, top, powerbuf, 2 * i); } for (; i < 32; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # endif @@ -1050,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, while (bits >= 0) { wvalue = bn_get_bits5(p->d, bits - 4); bits -= 5; - bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue); + bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); } } - ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top); + ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top); tmp.top = top; bn_correct_top(&tmp); if (ret) { @@ -1065,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } else #endif { - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window)) goto err; /* @@ -1079,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (window > 1) { if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF - (&tmp, top, powerbuf, 2, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, + window)) goto err; for (i = 3; i < numPowers; i++) { /* Calculate a^i = a^(i-1) * a */ if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF - (&tmp, top, powerbuf, i, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, + window)) goto err; } } @@ -1095,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, bits--; for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF - (&tmp, top, powerbuf, wvalue, numPowers)) + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue, + window)) goto err; /* @@ -1116,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, /* * Fetch the appropriate pre-computed value from the pre-buf */ - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF - (&am, top, powerbuf, wvalue, numPowers)) + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, + window)) goto err; /* Multiply the result into the intermediate result */ diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c index ab10b957ba27..bfa31efc5621 100644 --- a/crypto/bn/bn_print.c +++ b/crypto/bn/bn_print.c @@ -58,6 +58,7 @@ #include <stdio.h> #include <ctype.h> +#include <limits.h> #include "cryptlib.h" #include <openssl/buffer.h> #include "bn_lcl.h" @@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a) a++; } - for (i = 0; isxdigit((unsigned char)a[i]); i++) ; + for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++) + continue; + + if (i > INT_MAX/4) + goto err; num = i + neg; if (bn == NULL) @@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a) BN_zero(ret); } - /* i is the number of hex digests; */ + /* i is the number of hex digits */ if (bn_expand(ret, i * 4) == NULL) goto err; @@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a) a++; } - for (i = 0; isdigit((unsigned char)a[i]); i++) ; + for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++) + continue; + + if (i > INT_MAX/4) + goto err; num = i + neg; if (bn == NULL) @@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a) BN_zero(ret); } - /* i is the number of digests, a bit of an over expand; */ + /* i is the number of digits, a bit of an over expand */ if (bn_expand(ret, i * 4) == NULL) goto err; diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c index 7497ac624d94..f047040efe03 100644 --- a/crypto/bn/bn_recp.c +++ b/crypto/bn/bn_recp.c @@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp) BN_init(&(recp->N)); BN_init(&(recp->Nr)); recp->num_bits = 0; + recp->shift = 0; recp->flags = 0; } diff --git a/crypto/cmac/cmac.c b/crypto/cmac/cmac.c index 774e6dc91905..2954b6eb7dcf 100644 --- a/crypto/cmac/cmac.c +++ b/crypto/cmac/cmac.c @@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS); return 0; } + + /* Switch to FIPS cipher implementation if possible */ + if (cipher != NULL) { + const EVP_CIPHER *fcipher; + fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher)); + if (fcipher != NULL) + cipher = fcipher; + } /* * Other algorithm blocking will be done in FIPS_cmac_init, via * FIPS_cipherinit(). diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c index c9f674ba8e62..1925428f5ec5 100644 --- a/crypto/cryptlib.c +++ b/crypto/cryptlib.c @@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void) return stderr; } -int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) +int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len) { size_t i; - const unsigned char *a = in_a; - const unsigned char *b = in_b; + const volatile unsigned char *a = in_a; + const volatile unsigned char *b = in_b; unsigned char x = 0; for (i = 0; i < len; i++) diff --git a/crypto/crypto.h b/crypto/crypto.h index c450d7a3c374..6c644ce12a82 100644 --- a/crypto/crypto.h +++ b/crypto/crypto.h @@ -628,7 +628,7 @@ void OPENSSL_init(void); * into a defined order as the return value when a != b is undefined, other * than to be non-zero. */ -int CRYPTO_memcmp(const void *a, const void *b, size_t len); +int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len); /* BEGIN ERROR CODES */ /* diff --git a/crypto/dh/dh.h b/crypto/dh/dh.h index 5498a9dc1060..a5bd9016aae8 100644 --- a/crypto/dh/dh.h +++ b/crypto/dh/dh.h @@ -174,7 +174,7 @@ struct dh_st { /* DH_check_pub_key error codes */ # define DH_CHECK_PUBKEY_TOO_SMALL 0x01 # define DH_CHECK_PUBKEY_TOO_LARGE 0x02 -# define DH_CHECK_PUBKEY_INVALID 0x03 +# define DH_CHECK_PUBKEY_INVALID 0x04 /* * primes p where (p-1)/2 is prime too are called "safe"; we define this for diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c index 5adedc0d264e..027704111432 100644 --- a/crypto/dh/dh_check.c +++ b/crypto/dh/dh_check.c @@ -160,13 +160,12 @@ int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret) goto err; BN_CTX_start(ctx); tmp = BN_CTX_get(ctx); - if (tmp == NULL) + if (tmp == NULL || !BN_set_word(tmp, 1)) goto err; - BN_set_word(tmp, 1); if (BN_cmp(pub_key, tmp) <= 0) *ret |= DH_CHECK_PUBKEY_TOO_SMALL; - BN_copy(tmp, dh->p); - BN_sub_word(tmp, 1); + if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1)) + goto err; if (BN_cmp(pub_key, tmp) >= 0) *ret |= DH_CHECK_PUBKEY_TOO_LARGE; diff --git a/crypto/dsa/dsa_ameth.c b/crypto/dsa/dsa_ameth.c index c40e1777ade1..cc83d6e6ad3b 100644 --- a/crypto/dsa/dsa_ameth.c +++ b/crypto/dsa/dsa_ameth.c @@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8) STACK_OF(ASN1_TYPE) *ndsa = NULL; DSA *dsa = NULL; + int ret = 0; + if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8)) return 0; X509_ALGOR_get0(NULL, &ptype, &pval, palg); @@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8) } EVP_PKEY_assign_DSA(pkey, dsa); - BN_CTX_free(ctx); - if (ndsa) - sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); - else - ASN1_STRING_clear_free(privkey); - return 1; + ret = 1; + goto done; decerr: - DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR); + DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR); dsaerr: + DSA_free(dsa); + done: BN_CTX_free(ctx); - if (privkey) + if (ndsa) + sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); + else ASN1_STRING_clear_free(privkey); - sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); - DSA_free(dsa); - return 0; + return ret; } static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey) diff --git a/crypto/dso/dso_lib.c b/crypto/dso/dso_lib.c index 3312450eae67..2beb7c1ba542 100644 --- a/crypto/dso/dso_lib.c +++ b/crypto/dso/dso_lib.c @@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth) ret->meth = meth; ret->references = 1; if ((ret->meth->init != NULL) && !ret->meth->init(ret)) { + sk_void_free(ret->meth_data); OPENSSL_free(ret); ret = NULL; } diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl index e6acfd59f0d4..7140860e245b 100755 --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -2001,6 +2001,7 @@ $code.=<<___; push %r15 sub \$32*5+8, %rsp +.Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x mov $a_ptr, $b_ptr # backup copy movdqu 0x10($a_ptr), %xmm1 @@ -2291,6 +2292,7 @@ $code.=<<___; mov 0x40+8*1($b_ptr), $acc6 mov 0x40+8*2($b_ptr), $acc7 mov 0x40+8*3($b_ptr), $acc0 + movq $b_ptr, %xmm1 lea 0x40-$bias($b_ptr), $a_ptr lea $Z1sqr(%rsp), $r_ptr # Z1^2 @@ -2346,7 +2348,7 @@ $code.=<<___; test $acc0, $acc0 jnz .Ladd_proceed$x # (in1infty || in2infty)? test $acc1, $acc1 - jz .Ladd_proceed$x # is_equal(S1,S2)? + jz .Ladd_double$x # is_equal(S1,S2)? movq %xmm0, $r_ptr # restore $r_ptr pxor %xmm0, %xmm0 @@ -2359,6 +2361,13 @@ $code.=<<___; jmp .Ladd_done$x .align 32 +.Ladd_double$x: + movq %xmm1, $a_ptr # restore $a_ptr + movq %xmm0, $r_ptr # restore $r_ptr + add \$`32*(18-5)`, %rsp # difference in frame sizes + jmp .Lpoint_double_shortcut$x + +.align 32 .Ladd_proceed$x: `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 diff --git a/crypto/ec/ecp_nistp224.c b/crypto/ec/ecp_nistp224.c index ed09f97ade68..d81cc9ce6b1a 100644 --- a/crypto/ec/ecp_nistp224.c +++ b/crypto/ec/ecp_nistp224.c @@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || @@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); + done: if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) diff --git a/crypto/ec/ecp_nistp256.c b/crypto/ec/ecp_nistp256.c index a5887086c638..78d191aac7af 100644 --- a/crypto/ec/ecp_nistp256.c +++ b/crypto/ec/ecp_nistp256.c @@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(x_tmp, &group->generator->X)) || (!BN_to_felem(y_tmp, &group->generator->Y)) || @@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems); + done: if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup, nistp256_pre_comp_free, nistp256_pre_comp_clear_free)) diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c index 360b9a3516f6..c53a61bbfb69 100644 --- a/crypto/ec/ecp_nistp521.c +++ b/crypto/ec/ecp_nistp521.c @@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) || @@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); + done: if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup, nistp521_pre_comp_free, nistp521_pre_comp_clear_free)) diff --git a/crypto/ec/ectest.c b/crypto/ec/ectest.c index efab0b07b1d2..40a1f003259f 100644 --- a/crypto/ec/ectest.c +++ b/crypto/ec/ectest.c @@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test) if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT; + /* + * We have not performed precomputation so have_precompute mult should be + * false + */ + if (EC_GROUP_have_precompute_mult(NISTP)) + ABORT; + /* now repeat all tests with precomputation */ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT; + if (!EC_GROUP_have_precompute_mult(NISTP)) + ABORT; /* fixed point multiplication */ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx); diff --git a/crypto/engine/eng_dyn.c b/crypto/engine/eng_dyn.c index 3169b09ad865..40f30e9d585e 100644 --- a/crypto/engine/eng_dyn.c +++ b/crypto/engine/eng_dyn.c @@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx) * If we lost the race to set the context, c is non-NULL and *ctx is the * context of the thread that won. */ - if (c) + if (c) { + sk_OPENSSL_STRING_free(c->dirs); OPENSSL_free(c); + } return 1; } diff --git a/crypto/evp/e_des.c b/crypto/evp/e_des.c index aae13a675694..8ca65cd03ae1 100644 --- a/crypto/evp/e_des.c +++ b/crypto/evp/e_des.c @@ -71,12 +71,13 @@ typedef struct { DES_key_schedule ks; } ks; union { - void (*cbc) (const void *, void *, size_t, const void *, void *); + void (*cbc) (const void *, void *, size_t, + const DES_key_schedule *, unsigned char *); } stream; } EVP_DES_KEY; # if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__)) -/* ---------^^^ this is not a typo, just a way to detect that +/* ----------^^^ this is not a typo, just a way to detect that * assembler support was in general requested... */ # include "sparc_arch.h" @@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[]; void des_t4_key_expand(const void *key, DES_key_schedule *ks); void des_t4_cbc_encrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule *ks, unsigned char iv[8]); void des_t4_cbc_decrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule *ks, unsigned char iv[8]); # endif static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, @@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data; - if (dat->stream.cbc) { + if (dat->stream.cbc != NULL) { (*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv); return 1; } diff --git a/crypto/evp/e_des3.c b/crypto/evp/e_des3.c index bf6c1d2d3d39..0e910d6d8085 100644 --- a/crypto/evp/e_des3.c +++ b/crypto/evp/e_des3.c @@ -75,7 +75,8 @@ typedef struct { DES_key_schedule ks[3]; } ks; union { - void (*cbc) (const void *, void *, size_t, const void *, void *); + void (*cbc) (const void *, void *, size_t, + const DES_key_schedule *, unsigned char *); } stream; } DES_EDE_KEY; # define ks1 ks.ks[0] @@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[]; void des_t4_key_expand(const void *key, DES_key_schedule *ks); void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule ks[3], unsigned char iv[8]); void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule ks[3], unsigned char iv[8]); # endif static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, @@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, } # endif /* KSSL_DEBUG */ if (dat->stream.cbc) { - (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv); + (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv); return 1; } @@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out, int rv = -1; if (inl < 24) return -1; - if (!out) + if (out == NULL) return inl - 16; memcpy(ctx->iv, wrap_iv, 8); /* Decrypt first block which will end up as icv */ @@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) { unsigned char sha1tmp[SHA_DIGEST_LENGTH]; - if (!out) + if (out == NULL) return inl + 16; /* Copy input to output buffer + 8 so we have space for IV */ memmove(out + 8, in, inl); diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl index bd6bf72fe487..980cfd23efe3 100755 --- a/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -43,7 +43,7 @@ die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); + $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && @@ -489,7 +489,7 @@ $code.=<<___; ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 - movaps -0xd8(%rax),%xmm7 + movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 4ff2d39aa7b2..f889f2018789 100755 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -92,7 +92,7 @@ die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); + $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && diff --git a/crypto/modes/ctr128.c b/crypto/modes/ctr128.c index f3bbcbf72376..bcafd6b6bfb1 100644 --- a/crypto/modes/ctr128.c +++ b/crypto/modes/ctr128.c @@ -67,23 +67,20 @@ /* increment counter (128-bit int) by 1 */ static void ctr128_inc(unsigned char *counter) { - u32 n = 16; - u8 c; + u32 n = 16, c = 1; do { --n; - c = counter[n]; - ++c; - counter[n] = c; - if (c) - return; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; } while (n); } #if !defined(OPENSSL_SMALL_FOOTPRINT) static void ctr128_inc_aligned(unsigned char *counter) { - size_t *data, c, n; + size_t *data, c, d, n; const union { long one; char little; @@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter) 1 }; - if (is_endian.little) { + if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) { ctr128_inc(counter); return; } data = (size_t *)counter; + c = 1; n = 16 / sizeof(size_t); do { --n; - c = data[n]; - ++c; - data[n] = c; - if (c) - return; + d = data[n] += c; + /* did addition carry? */ + c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1); } while (n); } #endif @@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, } # if defined(STRICT_ALIGNMENT) - if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != - 0) + if (((size_t)in | (size_t)out | (size_t)ecount_buf) + % sizeof(size_t) != 0) break; # endif while (len >= 16) { (*block) (ivec, ecount_buf, key); ctr128_inc_aligned(ivec); - for (; n < 16; n += sizeof(size_t)) + for (n = 0; n < 16; n += sizeof(size_t)) *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n); len -= 16; @@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, /* increment upper 96 bits of 128-bit counter by 1 */ static void ctr96_inc(unsigned char *counter) { - u32 n = 12; - u8 c; + u32 n = 12, c = 1; do { --n; - c = counter[n]; - ++c; - counter[n] = c; - if (c) - return; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; } while (n); } diff --git a/crypto/opensslconf.h b/crypto/opensslconf.h index b4d522e68505..f533508b152c 100644 --- a/crypto/opensslconf.h +++ b/crypto/opensslconf.h @@ -38,12 +38,18 @@ extern "C" { #ifndef OPENSSL_NO_SSL_TRACE # define OPENSSL_NO_SSL_TRACE #endif +#ifndef OPENSSL_NO_SSL2 +# define OPENSSL_NO_SSL2 +#endif #ifndef OPENSSL_NO_STORE # define OPENSSL_NO_STORE #endif #ifndef OPENSSL_NO_UNIT_TEST # define OPENSSL_NO_UNIT_TEST #endif +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS +# define OPENSSL_NO_WEAK_SSL_CIPHERS +#endif #endif /* OPENSSL_DOING_MAKEDEPEND */ @@ -86,12 +92,18 @@ extern "C" { # if defined(OPENSSL_NO_SSL_TRACE) && !defined(NO_SSL_TRACE) # define NO_SSL_TRACE # endif +# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2) +# define NO_SSL2 +# endif # if defined(OPENSSL_NO_STORE) && !defined(NO_STORE) # define NO_STORE # endif # if defined(OPENSSL_NO_UNIT_TEST) && !defined(NO_UNIT_TEST) # define NO_UNIT_TEST # endif +# if defined(OPENSSL_NO_WEAK_SSL_CIPHERS) && !defined(NO_WEAK_SSL_CIPHERS) +# define NO_WEAK_SSL_CIPHERS +# endif #endif /* crypto/opensslconf.h.in */ diff --git a/crypto/opensslv.h b/crypto/opensslv.h index 03b8c4843784..4334fd15cd87 100644 --- a/crypto/opensslv.h +++ b/crypto/opensslv.h @@ -30,11 +30,11 @@ extern "C" { * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for * major minor fix final patch/beta) */ -# define OPENSSL_VERSION_NUMBER 0x1000206fL +# define OPENSSL_VERSION_NUMBER 0x1000207fL # ifdef OPENSSL_FIPS -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016" # else -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016" # endif # define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT diff --git a/crypto/perlasm/x86_64-xlate.pl b/crypto/perlasm/x86_64-xlate.pl index 9c70b8c2c6e9..ee04221c7e1d 100755 --- a/crypto/perlasm/x86_64-xlate.pl +++ b/crypto/perlasm/x86_64-xlate.pl @@ -198,8 +198,11 @@ my %globals; if ($gas) { # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{value} - $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; - $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg; + my $value = $self->{value}; + $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { + $self->{value} = $value; + } sprintf "\$%s",$self->{value}; } else { $self->{value} =~ s/(0b[0-1]+)/oct($1)/eig; diff --git a/crypto/pkcs7/pk7_smime.c b/crypto/pkcs7/pk7_smime.c index c4d3724d2a48..dc9b484078af 100644 --- a/crypto/pkcs7/pk7_smime.c +++ b/crypto/pkcs7/pk7_smime.c @@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store, PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT); return 0; } +#if 0 + /* + * NB: this test commented out because some versions of Netscape + * illegally include zero length content when signing data. Also + * Microsoft Authenticode includes a SpcIndirectDataContent data + * structure which describes the content to be protected by the + * signature, rather than directly embedding that content. So + * Authenticode implementations are also expected to use + * PKCS7_verify() with explicit external data, on non-detached + * PKCS#7 signatures. + * + * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been + * introduced to disable this sanity check. For the 1.0.2 branch + * this change is not acceptable, so the check remains completely + * commented out (as it has been for a long time). + */ /* Check for data and content: two sets of data */ if (!PKCS7_get_detached(p7) && indata) { PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT); return 0; } +#endif sinfos = PKCS7_get_signer_info(p7); diff --git a/crypto/rsa/rsa_sign.c b/crypto/rsa/rsa_sign.c index ed63a1d8b0e3..82ca8324dfbc 100644 --- a/crypto/rsa/rsa_sign.c +++ b/crypto/rsa/rsa_sign.c @@ -84,7 +84,7 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len, return 0; } #endif - if (rsa->meth->rsa_sign) { + if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) { return rsa->meth->rsa_sign(type, m, m_len, sigret, siglen, rsa); } /* Special case: SSL signature, just check the length */ @@ -293,7 +293,7 @@ int RSA_verify(int dtype, const unsigned char *m, unsigned int m_len, const unsigned char *sigbuf, unsigned int siglen, RSA *rsa) { - if (rsa->meth->rsa_verify) { + if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_verify) { return rsa->meth->rsa_verify(dtype, m, m_len, sigbuf, siglen, rsa); } diff --git a/crypto/srp/srp.h b/crypto/srp/srp.h index d072536fec9b..028892a1ff5e 100644 --- a/crypto/srp/srp.h +++ b/crypto/srp/srp.h @@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st { DECLARE_STACK_OF(SRP_gN_cache) typedef struct SRP_user_pwd_st { + /* Owned by us. */ char *id; BIGNUM *s; BIGNUM *v; + /* Not owned by us. */ const BIGNUM *g; const BIGNUM *N; + /* Owned by us. */ char *info; } SRP_user_pwd; DECLARE_STACK_OF(SRP_user_pwd) +void SRP_user_pwd_free(SRP_user_pwd *user_pwd); + typedef struct SRP_VBASE_st { STACK_OF(SRP_user_pwd) *users_pwd; STACK_OF(SRP_gN_cache) *gN_cache; @@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN) SRP_VBASE *SRP_VBASE_new(char *seed_key); int SRP_VBASE_free(SRP_VBASE *vb); int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file); + +/* This method ignores the configured seed and fails for an unknown user. */ SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username); +/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/ +SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username); + char *SRP_create_verifier(const char *user, const char *pass, char **salt, char **verifier, const char *N, const char *g); int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, diff --git a/crypto/srp/srp_vfy.c b/crypto/srp/srp_vfy.c index a3f1a8a0a4d5..26ad3e07b4bb 100644 --- a/crypto/srp/srp_vfy.c +++ b/crypto/srp/srp_vfy.c @@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size) return olddst; } -static void SRP_user_pwd_free(SRP_user_pwd *user_pwd) +void SRP_user_pwd_free(SRP_user_pwd *user_pwd) { if (user_pwd == NULL) return; @@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v) return (vinfo->s != NULL && vinfo->v != NULL); } +static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src) +{ + SRP_user_pwd *ret; + + if (src == NULL) + return NULL; + if ((ret = SRP_user_pwd_new()) == NULL) + return NULL; + + SRP_user_pwd_set_gN(ret, src->g, src->N); + if (!SRP_user_pwd_set_ids(ret, src->id, src->info) + || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) { + SRP_user_pwd_free(ret); + return NULL; + } + return ret; +} + SRP_VBASE *SRP_VBASE_new(char *seed_key) { SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE)); @@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file) } -SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username) +static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username) { int i; SRP_user_pwd *user; - unsigned char digv[SHA_DIGEST_LENGTH]; - unsigned char digs[SHA_DIGEST_LENGTH]; - EVP_MD_CTX ctxt; if (vb == NULL) return NULL; + for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) { user = sk_SRP_user_pwd_value(vb->users_pwd, i); if (strcmp(user->id, username) == 0) return user; } + + return NULL; +} + +/* + * This method ignores the configured seed and fails for an unknown user. + * Ownership of the returned pointer is not released to the caller. + * In other words, caller must not free the result. + */ +SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username) +{ + return find_user(vb, username); +} + +/* + * Ownership of the returned pointer is released to the caller. + * In other words, caller must free the result once done. + */ +SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username) +{ + SRP_user_pwd *user; + unsigned char digv[SHA_DIGEST_LENGTH]; + unsigned char digs[SHA_DIGEST_LENGTH]; + EVP_MD_CTX ctxt; + + if (vb == NULL) + return NULL; + + if ((user = find_user(vb, username)) != NULL) + return srp_user_pwd_dup(user); + if ((vb->seed_key == NULL) || (vb->default_g == NULL) || (vb->default_N == NULL)) return NULL; diff --git a/crypto/stack/stack.c b/crypto/stack/stack.c index de437acf6a5c..fa50083e22b3 100644 --- a/crypto/stack/stack.c +++ b/crypto/stack/stack.c @@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value) void sk_sort(_STACK *st) { - if (st && !st->sorted) { + if (st && !st->sorted && st->comp != NULL) { int (*comp_func) (const void *, const void *); /* diff --git a/crypto/x509/x509_vfy.c b/crypto/x509/x509_vfy.c index 0429767032fd..4d34dbac9314 100644 --- a/crypto/x509/x509_vfy.c +++ b/crypto/x509/x509_vfy.c @@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx) int num, j, retry; int (*cb) (int xok, X509_STORE_CTX *xctx); STACK_OF(X509) *sktmp = NULL; + int trust = X509_TRUST_UNTRUSTED; + int err; + if (ctx->cert == NULL) { X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY); return -1; @@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (((ctx->chain = sk_X509_new_null()) == NULL) || (!sk_X509_push(ctx->chain, ctx->cert))) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509); ctx->last_untrusted = 1; @@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (ctx->untrusted != NULL && (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } num = sk_X509_num(ctx->chain); @@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) { ok = ctx->get_issuer(&xtmp, ctx, x); if (ok < 0) - goto end; + goto err; /* * If successful for now free up cert so it will be picked up * again later. @@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (xtmp != NULL) { if (!sk_X509_push(ctx->chain, xtmp)) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509); (void)sk_X509_delete_ptr(sktmp, xtmp); @@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) bad_chain = 1; ok = cb(0, ctx); if (!ok) - goto end; + goto err; } else { /* * We have a match: replace certificate with store @@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx) ok = ctx->get_issuer(&xtmp, ctx, x); if (ok < 0) - goto end; + goto err; if (ok == 0) break; x = xtmp; if (!sk_X509_push(ctx->chain, x)) { X509_free(xtmp); X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - ok = 0; - goto end; + ok = -1; + goto err; } num++; } /* we now have our chain, lets check it... */ - i = check_trust(ctx); + if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) { + /* Callback already issued */ + ok = 0; + goto err; + } - /* If explicitly rejected error */ - if (i == X509_TRUST_REJECTED) - goto end; /* * If it's not explicitly trusted then check if there is an alternative * chain that could be used. We only do this if we haven't already @@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx) * chain checking */ retry = 0; - if (i != X509_TRUST_TRUSTED + if (trust != X509_TRUST_TRUSTED && !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) && !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) { while (j-- > 1) { xtmp2 = sk_X509_value(ctx->chain, j - 1); ok = ctx->get_issuer(&xtmp, ctx, xtmp2); if (ok < 0) - goto end; + goto err; /* Check if we found an alternate chain */ if (ok > 0) { /* @@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) * self signed certificate in which case we've indicated an error already * and set bad_chain == 1 */ - if (i != X509_TRUST_TRUSTED && !bad_chain) { + if (trust != X509_TRUST_TRUSTED && !bad_chain) { if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) { if (ctx->last_untrusted >= num) ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY; @@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx) bad_chain = 1; ok = cb(0, ctx); if (!ok) - goto end; + goto err; } /* We have the chain complete: now we need to check its purpose */ ok = check_chain_extensions(ctx); if (!ok) - goto end; + goto err; /* Check name constraints */ ok = check_name_constraints(ctx); if (!ok) - goto end; + goto err; ok = check_id(ctx); if (!ok) - goto end; + goto err; /* We may as well copy down any DSA parameters that are required */ X509_get_pubkey_parameters(NULL, ctx->chain); @@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx) ok = ctx->check_revocation(ctx); if (!ok) - goto end; + goto err; - i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain, - ctx->param->flags); - if (i != X509_V_OK) { - ctx->error = i; + err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain, + ctx->param->flags); + if (err != X509_V_OK) { + ctx->error = err; ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth); ok = cb(0, ctx); if (!ok) - goto end; + goto err; } /* At this point, we have a chain and need to verify it */ @@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx) else ok = internal_verify(ctx); if (!ok) - goto end; + goto err; #ifndef OPENSSL_NO_RFC3779 /* RFC 3779 path validation, now that CRL check has been done */ ok = v3_asid_validate_path(ctx); if (!ok) - goto end; + goto err; ok = v3_addr_validate_path(ctx); if (!ok) - goto end; + goto err; #endif /* If we get this far evaluate policies */ if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK)) ok = ctx->check_policy(ctx); if (!ok) - goto end; + goto err; if (0) { - end: + err: + /* Ensure we return an error */ + if (ok > 0) + ok = 0; X509_get_pubkey_parameters(NULL, ctx->chain); } if (sktmp != NULL) |