diff options
Diffstat (limited to 'lib/decompress')
-rw-r--r-- | lib/decompress/huf_decompress.c | 961 | ||||
-rw-r--r-- | lib/decompress/huf_decompress_amd64.S | 585 | ||||
-rw-r--r-- | lib/decompress/zstd_ddict.c | 2 | ||||
-rw-r--r-- | lib/decompress/zstd_ddict.h | 2 | ||||
-rw-r--r-- | lib/decompress/zstd_decompress.c | 354 | ||||
-rw-r--r-- | lib/decompress/zstd_decompress_block.c | 1038 | ||||
-rw-r--r-- | lib/decompress/zstd_decompress_block.h | 12 | ||||
-rw-r--r-- | lib/decompress/zstd_decompress_internal.h | 56 |
8 files changed, 2509 insertions, 501 deletions
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 141820671852..2027188255e1 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -1,7 +1,7 @@ /* ****************************************************************** * huff0 huffman decoder, * part of Finite State Entropy library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -22,6 +22,13 @@ #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 /* ************************************************************** * Macros @@ -36,6 +43,30 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif /* ************************************************************** * Error Management @@ -65,7 +96,7 @@ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ } \ \ - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ void* dst, size_t dstSize, \ const void* cSrc, size_t cSrcSize, \ const HUF_DTable* DTable) \ @@ -107,13 +138,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/** + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. + */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + + BYTE* const oend = (BYTE*)dst + dstSize; + + /* The following condition is false on x32 platform, + * but HUF_asm is not compatible with this ABI */ + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initDStream(args->ip[0]); + args->bits[1] = HUF_initDStream(args->ip[1]); + args->bits[2] = HUF_initDStream(args->ip[2]); + args->bits[3] = HUF_initDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>= istart). + */ + args->ilimit = ilimit; + + args->oend = oend; + args->dt = dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ + bit->bitContainer = MEM_readLE64(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} +#endif + #ifndef HUF_FORCE_DECOMPRESS_X2 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ /** * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at @@ -122,14 +287,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 = symbol + (nbBits << 8); - } else { D4 = (symbol << 8) + nbBits; + } else { + D4 = symbol + (nbBits << 8); } D4 *= 0x0001000100010001ULL; return D4; } +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + typedef struct { U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +357,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); if (HUF_isError(iSize)) return iSize; + /* Table header */ { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType = 0; dtd.tableLog = (BYTE)tableLog; @@ -207,7 +406,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr /* fill DTable * We fill all entries of each weight in order. - * That way length is a constant for each iteration of the outter loop. + * That way length is a constant for each iteration of the outer loop. * We can switch based on the length to a different inner loop which is * optimized for that particular case. */ @@ -304,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons BYTE* const pStart = p; /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); } /* [0-3] symbols remaining */ @@ -388,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body( U32 endSignal = 1; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ - for ( ; (endSignal) & (op4 < olimit) ; ) { - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); - endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } } /* check corruption */ @@ -440,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body( } } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. */ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -447,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const HUF_DTable *DTable); HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} size_t HUF_decompress1X1_usingDTable( @@ -518,188 +817,315 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, /* *************************/ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} + +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + HUF_DEltX2 DElt; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} + +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} /* HUF_fillDTableX2Level2() : * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, U32 nbBitsBaseline, U16 baseSeq) { - HUF_DEltX2 DElt; - U32 rankVal[HUF_TABLELOG_MAX + 1]; - - /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal)); - - /* fill skipped values */ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ if (minWeight>1) { - U32 i, skipSize = rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits = (BYTE)(consumed); - DElt.length = 1; - for (i = 0; i < skipSize; i++) - DTable[i] = DElt; + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } } - /* fill DTable */ - { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */ - const U32 symbol = sortedSymbols[s].symbol; - const U32 weight = sortedSymbols[s].weight; - const U32 nbBits = nbBitsBaseline - weight; - const U32 length = 1 << (sizeLog-nbBits); - const U32 start = rankVal[weight]; - U32 i = start; - const U32 end = start + length; - - MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); - DElt.nbBits = (BYTE)(nbBits + consumed); - DElt.length = 2; - do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */ - - rankVal[weight] += length; - } } + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sortedListSize, + const sortedSymbol_t* sortedList, const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) { - U32 rankVal[HUF_TABLELOG_MAX + 1]; + U32* const rankVal = rankValOrigin[0]; const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ const U32 minBits = nbBitsBaseline - maxWeight; - U32 s; - - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal)); - - /* fill DTable */ - for (s=0; s<sortedListSize; s++) { - const U16 symbol = sortedList[s].symbol; - const U32 weight = sortedList[s].weight; - const U32 nbBits = nbBitsBaseline - weight; - const U32 start = rankVal[weight]; - const U32 length = 1 << (targetLog-nbBits); - - if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */ - U32 sortedRank; + int w; + int const wEnd = (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); int minWeight = nbBits + scaleLog; + int s; if (minWeight < 1) minWeight = 1; - sortedRank = rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRank, - nbBitsBaseline, symbol); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits = (BYTE)(nbBits); - DElt.length = 1; - { U32 const end = start + length; - U32 u; - for (u = start; u < end; u++) DTable[u] = DElt; - } } - rankVal[weight] += length; + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } +typedef struct { + rankValCol_t rankVal[HUF_TABLELOG_MAX]; + U32 rankStats[HUF_TABLELOG_MAX + 1]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; + sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; + BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; + U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +} HUF_ReadDTableX2_Workspace; + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); - U32 const maxTableLog = dtd.maxTableLog; + U32 maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; U32 *rankStart; - rankValCol_t* rankVal; - U32* rankStats; - U32* rankStart0; - sortedSymbol_t* sortedSymbol; - BYTE* weightList; - size_t spaceUsed32 = 0; - - rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; - rankStats = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 1; - rankStart0 = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 2; - sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); - spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; - weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; - - if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); - - rankStart = rankStart0 + 1; - ZSTD_memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; + + if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); + + rankStart = wksp->rankStart0 + 1; + ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); + ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); if (HUF_isError(iSize)) return iSize; /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; /* find maxWeight */ - for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ /* Get start index of each weight */ { U32 w, nextRankStart = 0; for (w=1; w<maxW+1; w++) { U32 curr = nextRankStart; - nextRankStart += rankStats[w]; + nextRankStart += wksp->rankStats[w]; rankStart[w] = curr; } rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ - sizeOfSort = nextRankStart; + rankStart[maxW+1] = nextRankStart; } /* sort symbols by weight */ { U32 s; for (s=0; s<nbSymbols; s++) { - U32 const w = weightList[s]; + U32 const w = wksp->weightList[s]; U32 const r = rankStart[w]++; - sortedSymbol[r].symbol = (BYTE)s; - sortedSymbol[r].weight = (BYTE)w; + wksp->sortedSymbol[r].symbol = (BYTE)s; } rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ } /* Build rankVal */ - { U32* const rankVal0 = rankVal[0]; + { U32* const rankVal0 = wksp->rankVal[0]; { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ U32 nextRankVal = 0; U32 w; for (w=1; w<maxW+1; w++) { U32 curr = nextRankVal; - nextRankVal += rankStats[w] << (w+rescale); + nextRankVal += wksp->rankStats[w] << (w+rescale); rankVal0[w] = curr; } } { U32 const minBits = tableLog+1 - maxW; U32 consumed; for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { - U32* const rankValPtr = rankVal[consumed]; + U32* const rankValPtr = wksp->rankVal[consumed]; U32 w; for (w = 1; w < maxW+1; w++) { rankValPtr[w] = rankVal0[w] >> consumed; } } } } HUF_fillDTableX2(dt, maxTableLog, - sortedSymbol, sizeOfSort, - rankStart0, rankVal, maxW, + wksp->sortedSymbol, + wksp->rankStart0, wksp->rankVal, maxW, tableLog+1); dtd.tableLog = (BYTE)maxTableLog; @@ -713,7 +1139,7 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -722,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length==1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); - } } + } + } return 1; } @@ -752,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, BYTE* const pStart = p; /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); } /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= 2) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); - while (p <= pEnd-2) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + } if (p < pEnd) p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -798,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } - FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -840,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body( U32 const dtLog = dtd.tableLog; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* 16-32 symbols per loop (4-8 symbols per stream) */ - for ( ; (endSignal) & (op4 < olimit); ) { + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; #else - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal = (U32)LIKELY( - (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); #endif + } } /* check corruption */ @@ -914,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body( } } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal) size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, @@ -1024,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ }; #endif @@ -1069,7 +1608,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) U32 const D256 = (U32)(dstSize >> 8); U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif @@ -1225,7 +1764,7 @@ size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cS HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); } -#endif +#endif #ifndef HUF_FORCE_DECOMPRESS_X1 size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S new file mode 100644 index 000000000000..49589cb61141 --- /dev/null +++ b/lib/decompress/huf_decompress_amd64.S @@ -0,0 +1,585 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "../common/portability_macros.h" + +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +/* Calling convention: + * + * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + * No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + * + * TODO: Support Windows calling convention. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.text + +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. + */ + +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi + +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 + +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 + +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 + +/* 32-bit var registers */ +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. */ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + /* Read HUF_DecompressAsmArgs* args from %rax */ + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push 104(%rax) /* ilimit */ + push 112(%rax) /* oend */ + push %olimit /* olimit space */ + + subq $24, %rsp + +.L_4X1_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rbx, rdx must be saved + * op3 & ip0 mustn't be clobbered + */ + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) + + movq 32(%rsp), %rax /* rax = oend */ + subq %op3, %rax /* rax = oend - op3 */ + + /* r15 = (oend - op3) / 5 */ + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 + + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %rbx /* rbx = ip0 - ilimit */ + + /* rdx = (ip0 - ilimit) / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* r15 = r15 * 5 */ + leaq (%r15, %r15, 4), %r15 + + /* olimit = op3 + r15 */ + addq %op3, %olimit + + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx + + /* If (op3 + 20 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $20, %rax /* rax = op3 + 20 */ + cmpq %rax, %olimit /* op3 + 20 > olimit */ + jb .L_4X1_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X1_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X1_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X1_exit + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) + +/* Assumes GET_NEXT_DELT has been called. + * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n + + /* Store clobbered variables on the stack */ + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) + + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + + /* Load ip[1,2,3] from stack (var[] aliases them) + * ip[] is needed for RELOAD_BITS + * Each will be stored back to the stack after RELOAD + */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) + + /* If op3 < olimit: continue the loop */ + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body + + /* Reload ip[1,2,3] from stack */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Re-compute olimit */ + jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: + addq $24, %rsp + + /* Restore stack (oend & olimit) */ + pop %rax /* olimit */ + pop %rax /* oend */ + pop %rax /* ilimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. */ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push %rax /* olimit */ + push 104(%rax) /* ilimit */ + + movq 112(%rax), %rax + push %rax /* oend3 */ + + movq %op3, %rax + push %rax /* oend2 */ + + movq %op2, %rax + push %rax /* oend1 */ + + movq %op1, %rax + push %rax /* oend0 */ + + /* Scratch space */ + subq $8, %rsp + +.L_4X2_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rdx must be saved + * op[1,2,3,4] & ip0 mustn't be clobbered + */ + movq %rdx, 0(%rsp) + + /* We can consume up to 7 input bytes each iteration. */ + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %r15 /* r15 = ip0 - ilimit */ + + /* rdx = rax / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx + + /* r15 = (ip0 - ilimit) / 7 */ + movq %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 8(%rsp), %rax /* rax = oend0 */ + subq %op0, %rax /* rax = oend0 - op0 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 16(%rsp), %rax /* rax = oend1 */ + subq %op1, %rax /* rax = oend1 - op1 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 24(%rsp), %rax /* rax = oend2 */ + subq %op2, %rax /* rax = oend2 - op2 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 32(%rsp), %rax /* rax = oend3 */ + subq %op3, %rax /* rax = oend3 - op3 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* olimit = op3 + 5 * r15 */ + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit + + movq 0(%rsp), %rdx + + /* If (op3 + 10 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $10, %rax /* rax = op3 + 10 */ + cmpq %rax, %olimit /* op3 + 10 > olimit */ + jb .L_4X2_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X2_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X2_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X2_exit + +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n + +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n + + + movq %olimit, 48(%rsp) + + .p2align 6 + +.L_4X2_loop_body: + /* We clobber r8, so store it on the stack */ + movq %r8, 0(%rsp) + + /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + /* Reload r8 */ + movq 0(%rsp), %r8 + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: + addq $8, %rsp + /* Restore stack (oend & olimit) */ + pop %rax /* oend0 */ + pop %rax /* oend1 */ + pop %rax /* oend2 */ + pop %rax /* oend3 */ + pop %rax /* ilimit */ + pop %rax /* olimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +#endif diff --git a/lib/decompress/zstd_ddict.c b/lib/decompress/zstd_ddict.c index f5cc23b387c4..ce335477b32d 100644 --- a/lib/decompress/zstd_ddict.c +++ b/lib/decompress/zstd_ddict.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/decompress/zstd_ddict.h b/lib/decompress/zstd_ddict.h index 8906a71c9404..bd03268b5087 100644 --- a/lib/decompress/zstd_ddict.h +++ b/lib/decompress/zstd_ddict.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 21f846bc77e7..0031e98cfb1a 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -56,12 +56,12 @@ * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ -#include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" +#include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ #include "../common/zstd_internal.h" /* blockProperties_t */ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ @@ -72,6 +72,147 @@ #endif + +/************************************* + * Multiple DDicts Hashset internals * + *************************************/ + +#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. + */ + +#define DDICT_HASHSET_TABLE_BASE_SIZE 64 +#define DDICT_HASHSET_RESIZE_FACTOR 2 + +/* Hash function to determine starting position of dict insertion within the table + * Returns an index between [0, hashSet->ddictPtrTableSize] + */ +static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) { + const U64 hash = XXH64(&dictID, sizeof(U32), 0); + /* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */ + return hash & (hashSet->ddictPtrTableSize - 1); +} + +/* Adds DDict to a hashset without resizing it. + * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set. + * Returns 0 if successful, or a zstd error code if something went wrong. + */ +static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) { + const U32 dictID = ZSTD_getDictID_fromDDict(ddict); + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!"); + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + while (hashSet->ddictPtrTable[idx] != NULL) { + /* Replace existing ddict if inserting ddict with same dictID */ + if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) { + DEBUGLOG(4, "DictID already exists, replacing rather than adding"); + hashSet->ddictPtrTable[idx] = ddict; + return 0; + } + idx &= idxRangeMask; + idx++; + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + hashSet->ddictPtrTable[idx] = ddict; + hashSet->ddictPtrCount++; + return 0; +} + +/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and + * rehashes all values, allocates new table, frees old table. + * Returns 0 on success, otherwise a zstd error code. + */ +static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR; + const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem); + const ZSTD_DDict** oldTable = hashSet->ddictPtrTable; + size_t oldTableSize = hashSet->ddictPtrTableSize; + size_t i; + + DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize); + RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!"); + hashSet->ddictPtrTable = newTable; + hashSet->ddictPtrTableSize = newTableSize; + hashSet->ddictPtrCount = 0; + for (i = 0; i < oldTableSize; ++i) { + if (oldTable[i] != NULL) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), ""); + } + } + ZSTD_customFree((void*)oldTable, customMem); + DEBUGLOG(4, "Finished re-hash"); + return 0; +} + +/* Fetches a DDict with the given dictID + * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL. + */ +static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) { + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + for (;;) { + size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]); + if (currDictID == dictID || currDictID == 0) { + /* currDictID == 0 implies a NULL ddict entry */ + break; + } else { + idx &= idxRangeMask; /* Goes to start of table when we reach the end */ + idx++; + } + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + return hashSet->ddictPtrTable[idx]; +} + +/* Allocates space for and returns a ddict hash set + * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with. + * Returns NULL if allocation failed. + */ +static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { + ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); + DEBUGLOG(4, "Allocating new hash set"); + if (!ret) + return NULL; + ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); + if (!ret->ddictPtrTable) { + ZSTD_customFree(ret, customMem); + return NULL; + } + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; + return ret; +} + +/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself. + * Note: The ZSTD_DDict* within the table are NOT freed. + */ +static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + DEBUGLOG(4, "Freeing ddict hash set"); + if (hashSet && hashSet->ddictPtrTable) { + ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem); + } + if (hashSet) { + ZSTD_customFree(hashSet, customMem); + } +} + +/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set. + * Returns 0 on success, or a ZSTD error. + */ +static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) { + DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize); + if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), ""); + } + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), ""); + return 0; +} + /*-************************************************************* * Context management ***************************************************************/ @@ -101,6 +242,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) @@ -115,13 +257,17 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->inBuffSize = 0; dctx->outBuffSize = 0; dctx->streamStage = zdss_init; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) dctx->legacyContext = NULL; dctx->previousLegacyVersion = 0; +#endif dctx->noForwardProgress = 0; dctx->oversizedDuration = 0; - dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif + dctx->ddictSet = NULL; ZSTD_DCtx_resetParameters(dctx); - dctx->validateChecksum = 1; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION dctx->dictContentEndForFuzzing = NULL; #endif @@ -140,8 +286,7 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) return dctx; } -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); @@ -152,10 +297,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) } } +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + ZSTD_DCtx* ZSTD_createDCtx(void) { DEBUGLOG(3, "ZSTD_createDCtx"); - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -178,6 +328,10 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) if (dctx->legacyContext) ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); #endif + if (dctx->ddictSet) { + ZSTD_freeDDictHashSet(dctx->ddictSet, cMem); + dctx->ddictSet = NULL; + } ZSTD_customFree(dctx, cMem); return 0; } @@ -190,6 +344,29 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ } +/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on + * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then + * accordingly sets the ddict to be used to decompress the frame. + * + * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is. + * + * ZSTD_d_refMultipleDDicts must be enabled for this function to be called. + */ +static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) { + assert(dctx->refMultipleDDicts && dctx->ddictSet); + DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame"); + if (dctx->ddict) { + const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID); + if (frameDDict) { + DEBUGLOG(4, "DDict found!"); + ZSTD_clearDict(dctx); + dctx->dictID = dctx->fParams.dictID; + dctx->ddict = frameDDict; + dctx->dictUses = ZSTD_use_indefinitely; + } + } +} + /*-************************************************************* * Frame header decoding @@ -213,6 +390,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + /** ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -299,7 +489,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -307,7 +499,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(fcsID) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -336,7 +530,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); } - /** ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if known, otherwise @@ -377,6 +570,37 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) } } +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize) +{ + U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + /** ZSTD_findDecompressedSize() : * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed and/or @@ -441,12 +665,19 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) /** ZSTD_decodeFrameHeader() : * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * If multiple DDict references are enabled, also will choose the correct DDict to use. * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) { size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); if (ZSTD_isError(result)) return result; /* invalid header */ RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); + + /* Reference DDict requested by frame if dctx references multiple ddicts */ + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) { + ZSTD_DCtx_selectFrameDDict(dctx); + } + #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION /* Skip the dictID check in fuzzing mode, because it makes the search * harder. @@ -456,6 +687,7 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t he #endif dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0; if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0); + dctx->processedCSize += headerSize; return 0; } @@ -578,7 +810,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) { DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); - ZSTD_checkContinuity(dctx, blockStart); + ZSTD_checkContinuity(dctx, blockStart, blockSize); dctx->previousDstEnd = (const char*)blockStart + blockSize; return blockSize; } @@ -610,6 +842,32 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, return regenSize; } +static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) +{ +#if ZSTD_TRACE + if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) { + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + if (dctx->ddict) { + trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict); + trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict); + trace.dictionaryIsCold = dctx->ddictIsCold; + } + trace.uncompressedSize = (size_t)uncompressedSize; + trace.compressedSize = (size_t)compressedSize; + trace.dctx = dctx; + ZSTD_trace_decompress_end(dctx->traceCtx, &trace); + } +#else + (void)dctx; + (void)uncompressedSize; + (void)compressedSize; + (void)streaming; +#endif +} + /*! ZSTD_decompressFrame() : * @dctx must be properly initialized @@ -619,8 +877,9 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void** srcPtr, size_t *srcSizePtr) { - const BYTE* ip = (const BYTE*)(*srcPtr); - BYTE* const ostart = (BYTE* const)dst; + const BYTE* const istart = (const BYTE*)(*srcPtr); + const BYTE* ip = istart; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart; BYTE* op = ostart; size_t remainingSrcSize = *srcSizePtr; @@ -656,7 +915,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); break; case bt_raw : decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); @@ -695,7 +954,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, ip += 4; remainingSrcSize -= 4; } - + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ *srcPtr = ip; *srcSizePtr = remainingSrcSize; @@ -764,7 +1023,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, * use this in all cases but ddict */ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); } - ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); @@ -807,7 +1066,7 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) switch (dctx->dictUses) { default: assert(0 /* Impossible */); - /* fall-through */ + ZSTD_FALLTHROUGH; case ZSTD_dont_use: ZSTD_clearDict(dctx); return NULL; @@ -829,7 +1088,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr { #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) size_t regenSize; - ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); ZSTD_freeDCtx(dctx); @@ -863,7 +1122,7 @@ static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t return dctx->expected; if (dctx->bType != bt_raw) return dctx->expected; - return MIN(MAX(inputSize, 1), dctx->expected); + return BOUNDED(1, inputSize, dctx->expected); } ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { @@ -871,7 +1130,9 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { { default: /* should not happen */ assert(0); + ZSTD_FALLTHROUGH; case ZSTDds_getFrameHeaderSize: + ZSTD_FALLTHROUGH; case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; case ZSTDds_decodeBlockHeader: @@ -883,6 +1144,7 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { case ZSTDds_checkChecksum: return ZSTDnit_checksum; case ZSTDds_decodeSkippableHeader: + ZSTD_FALLTHROUGH; case ZSTDds_skipFrame: return ZSTDnit_skippableFrame; } @@ -899,7 +1161,9 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); /* Sanity check */ RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed"); - if (dstCapacity) ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + dctx->processedCSize += srcSize; switch (dctx->stage) { @@ -964,7 +1228,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); dctx->expected = 0; /* Streaming not supported */ break; case bt_raw : @@ -1004,6 +1268,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c dctx->expected = 4; dctx->stage = ZSTDds_checkChecksum; } else { + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; /* ends here */ dctx->stage = ZSTDds_getFrameHeaderSize; } @@ -1023,6 +1288,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); } + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; dctx->stage = ZSTDds_getFrameHeaderSize; return 0; @@ -1176,8 +1442,12 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) { assert(dctx != NULL); +#if ZSTD_TRACE + dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0; +#endif dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->processedCSize = 0; dctx->decodedSize = 0; dctx->previousDstEnd = NULL; dctx->prefixStart = NULL; @@ -1283,7 +1553,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, ZSTD_DStream* ZSTD_createDStream(void) { DEBUGLOG(3, "ZSTD_createDStream"); - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1293,7 +1563,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - return ZSTD_createDCtx_advanced(customMem); + return ZSTD_createDCtx_internal(customMem); } size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1391,6 +1661,16 @@ size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) if (ddict) { dctx->ddict = ddict; dctx->dictUses = ZSTD_use_indefinitely; + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) { + if (dctx->ddictSet == NULL) { + dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem); + if (!dctx->ddictSet) { + RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!"); + } + } + assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */ + FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), ""); + } } return 0; } @@ -1436,6 +1716,10 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_d_validateChecksum; bounds.upperBound = (int)ZSTD_d_ignoreChecksum; return bounds; + case ZSTD_d_refMultipleDDicts: + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; default:; } bounds.error = ERROR(parameter_unsupported); @@ -1473,6 +1757,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_forceIgnoreChecksum: *value = (int)dctx->forceIgnoreChecksum; return 0; + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1499,6 +1786,13 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value); dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value; return 0; + case ZSTD_d_refMultipleDDicts: + CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value); + if (dctx->staticSize != 0) { + RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!"); + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1529,7 +1823,8 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) { size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); - unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); + /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ + unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); size_t const minRBSize = (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, @@ -1663,10 +1958,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB DEBUGLOG(5, "stage zdss_init => transparent reset "); zds->streamStage = zdss_loadHeader; zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) zds->legacyVersion = 0; +#endif zds->hostageByte = 0; zds->expectedOutBuffer = *output; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_loadHeader : DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); @@ -1680,6 +1977,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } #endif { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } DEBUGLOG(5, "header size : %u", (U32)hSize); if (ZSTD_isError(hSize)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) @@ -1801,7 +2101,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->outBuffSize = neededOutBuffSize; } } } zds->streamStage = zdss_read; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_read: DEBUGLOG(5, "stage zdss_read"); @@ -1820,7 +2120,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } if (ip==iend) { someMoreWork = 0; break; } /* no more input */ zds->streamStage = zdss_load; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_load: { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 19cbdc5c16eb..2e44d30d2f37 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, } } +/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) +{ + if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) + { + /* room for litbuffer to fit without read faulting */ + dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; + } + else if (litSize > ZSTD_LITBUFFEREXTRASIZE) + { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { + /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; + } + else + { + /* fits entirely within litExtraBuffer, so no split is necessary */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +} /* Hidden declaration for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize); + const void* src, size_t srcSize, + void* dst, size_t dstCapacity, const streaming_operation streaming); /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. + * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); @@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); - /* fall-through */ + ZSTD_FALLTHROUGH; case set_compressed: RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); @@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (singleStream) { hufSuccess = HUF_decompress1X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } else { hufSuccess = HUF_decompress4X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } } else { if (singleStream) { @@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); #endif } else { hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); } } + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + } RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); @@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, dctx->litSize = litSize; dctx->litEntropy = 1; if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return litCSize + lhSize; } case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); - ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr = istart+lhSize; dctx->litSize = litSize; + dctx->litBufferEnd = dctx->litPtr + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; return lhSize+litSize; } case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); - ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+1; @@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = { }; /* ML_defaultDTable */ -static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits) +static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) { void* ptr = dt; ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; @@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB cell->nbBits = 0; cell->nextState = 0; assert(nbAddBits < 255); - cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->nbAdditionalBits = nbAddBits; cell->baseValue = baseValue; } @@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB FORCE_INLINE_TEMPLATE void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; @@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) ); tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); - tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol]; + tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; tableDecode[u].baseValue = baseValue[symbol]; } } @@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, /* Avoids the FORCE_INLINE of the _body() function. */ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, } #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, +BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) { #if DYNAMIC_BMI2 @@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, symbolEncodingType_e type, unsigned max, U32 maxLog, const void* src, size_t srcSize, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, int bmi2) @@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); { U32 const symbol = *(const BYTE*)src; U32 const baseline = baseValue[symbol]; - U32 const nbBits = nbAdditionalBits[symbol]; + U8 const nbBits = nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr = DTableSpace; @@ -577,7 +653,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; int nbSeq; @@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, LL_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += llhSize; } @@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, OF_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += ofhSize; } @@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, ML_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += mlhSize; } @@ -658,7 +734,6 @@ typedef struct { size_t litLength; size_t matchLength; size_t offset; - const BYTE* match; } seq_t; typedef struct { @@ -672,9 +747,6 @@ typedef struct { ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* prefixStart; - const BYTE* dictEnd; - size_t pos; } seqState_t; /*! ZSTD_overlapCopy8() : @@ -717,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer. */ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; @@ -733,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ assert(length >= 8); ZSTD_overlapCopy8(&op, &ip, diff); + length -= 8; assert(op - ip >= 8); assert(op <= oend); } @@ -747,8 +820,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip += oend_w - op; - op = oend_w; + op += oend_w - op; + } + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ = *ip++; + return; + } + + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); + ip += oend - WILDCOPY_OVERLENGTH - op; + op += oend - WILDCOPY_OVERLENGTH - op; } + /* Handle the leftovers. */ while (op < oend) *op++ = *ip++; } @@ -763,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ */ FORCE_NOINLINE size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -788,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); - match = dictEnd - (prefixStart-match); + match = dictEnd - (prefixStart - match); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - ZSTD_memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = prefixStart; - } } + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. + */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } HINT_INLINE size_t ZSTD_execSequence(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -825,10 +970,102 @@ size_t ZSTD_execSequence(BYTE* op, * - 32-bit mode and the match length overflows */ if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) - return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <= oLitEnd /* No overflow */); @@ -896,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } + static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) { @@ -909,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS } FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) -{ - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; - U32 const nbBits = DInfo.nbBits; - size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { - U32 const nbBits = DInfo.nbBits; size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; + DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum @@ -936,116 +1164,105 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD : 0) typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; -typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e; FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) { seq_t seq; - ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; - ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; - ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; - U32 const llBase = llDInfo.baseValue; - U32 const mlBase = mlDInfo.baseValue; - U32 const ofBase = ofDInfo.baseValue; - BYTE const llBits = llDInfo.nbAdditionalBits; - BYTE const mlBits = mlDInfo.nbAdditionalBits; - BYTE const ofBits = ofDInfo.nbAdditionalBits; - BYTE const totalBits = llBits+mlBits+ofBits; - - /* sequence */ - { size_t offset; - if (ofBits > 1) { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } else { - U32 const ll0 = (llBase == 0); - if (LIKELY((ofBits == 0))) { - if (LIKELY(!ll0)) - offset = seqState->prevOffset[0]; - else { - offset = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likelyness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; } else { - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); - { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } } } - seq.offset = offset; - } - - seq.matchLength = mlBase; - if (mlBits > 0) - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - - seq.litLength = llBase; - if (llBits > 0) - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - - if (prefetch == ZSTD_p_prefetch) { - size_t const pos = seqState->pos + seq.litLength; - const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; - seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. - * No consequence though : no memory access will occur, offset is only used for prefetching */ - seqState->pos = pos + seq.matchLength; - } - - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). - * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if defined(__GNUC__) && !defined(__clang__) - const int kUseUpdateFseState = 1; -#else - const int kUseUpdateFseState = 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ } return seq; @@ -1098,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence( #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + + FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, +ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset, @@ -1108,21 +1327,20 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); (void)frame; /* Regen sequences */ if (nbSeq) { seqState_t seqState; - size_t error = 0; dctx->fseEntropy = 1; { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } RETURN_ERROR_IF( @@ -1138,70 +1356,255 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); + /* decompress without overrunning litPtr begins */ + { + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompression loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on bad cases and influence on performance + * from changes external to this code, rather than to overoptimize on the current commit. + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ #if defined(__GNUC__) && defined(__x86_64__) - /* Align the decompression loop to 32 + 16 bytes. - * - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression - * speed swings based on the alignment of the decompression loop. This - * performance swing is caused by parts of the decompression loop falling - * out of the DSB. The entire decompression loop should fit in the DSB, - * when it can't we get much worse performance. You can measure if you've - * hit the good case or the bad case with this perf command for some - * compressed file test.zst: - * - * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ - * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst - * - * If you see most cycles served out of the MITE you've hit the bad case. - * If you see most cycles served out of the DSB you've hit the good case. - * If it is pretty even then you may be in an okay case. - * - * I've been able to reproduce this issue on the following CPUs: - * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 - * Use Instruments->Counters to get DSB/MITE cycles. - * I never got performance swings, but I was able to - * go from the good case of mostly DSB to half of the - * cycles served from MITE. - * - Coffeelake: Intel i9-9900k - * - * I haven't been able to reproduce the instability or DSB misses on any - * of the following CPUS: - * - Haswell - * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH - * - Skylake - * - * If you are seeing performance stability this script can help test. - * It tests on 4 commits in zstd where I saw performance change. - * - * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 - */ - __asm__(".p2align 5"); - __asm__("nop"); - __asm__(".p2align 4"); + __asm__(".p2align 6"); +# if __GNUC__ >= 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ == 8 || __GNUC__ == 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ + for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { + size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + } + + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (--nbSeq) + BIT_reloadDStream(&(seqState.DStream)); + } + } + } + + if (nbSeq > 0) /* there is remaining lit from extra buffer */ + { + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ != 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >= 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif #endif + + for (; ; ) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ + { + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } + { size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >= 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# else + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + for ( ; ; ) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch); + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - BIT_reloadDStream(&(seqState.DStream)); op += oneSeqSize; - /* gcc and clang both don't like early returns in this loop. - * Instead break and check for an error at the end of the loop. - */ - if (UNLIKELY(ZSTD_isError(oneSeqSize))) { - error = oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; - } - if (UNLIKELY(!--nbSeq)) break; + BIT_reloadDStream(&(seqState.DStream)); } /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); - if (ZSTD_isError(error)) return error; RETURN_ERROR_IF(nbSeq, corruption_detected, ""); RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); /* save reps for next block */ @@ -1229,9 +1632,37 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +FORCE_INLINE_TEMPLATE size_t +ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) +{ + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; + const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : memory address is only used for prefetching, not for dereferencing */ + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +} + +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. + * It's generally employed when block contains a significant portion of long-distance matches + * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, @@ -1242,11 +1673,11 @@ ZSTD_decompressSequencesLong_body( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); @@ -1254,18 +1685,17 @@ ZSTD_decompressSequencesLong_body( /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 4 +#define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 +#define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; + size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ + dctx->fseEntropy = 1; { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } - seqState.prefixStart = prefixStart; - seqState.pos = (size_t)(op-prefixStart); - seqState.dictEnd = dictEnd; assert(dst != NULL); assert(iend >= ip); RETURN_ERROR_IF( @@ -1277,36 +1707,100 @@ ZSTD_decompressSequencesLong_body( /* prepare in advance */ for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) { - sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch); - PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb] = sequence; } RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); - /* decode and decompress */ - for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); + /* decompress without stomping litBuffer */ + for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + size_t oneSeqSize; + + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) + { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ - sequences[seqNb & STORED_SEQS_MASK] = sequence; - op += oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ + oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, ""); /* finish queue */ seqNb -= seqAdvance; for ( ; seqNb<nbSeq ; seqNb++) { - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); + seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) + { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { + size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op += oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + else + { + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } } /* save reps for next block */ @@ -1314,10 +1808,21 @@ ZSTD_decompressSequencesLong_body( } /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; + if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ + { + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize = litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); + ZSTD_memmove(op, litPtr, lastLLSize); op += lastLLSize; } } @@ -1341,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, #if DYNAMIC_BMI2 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1351,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -1383,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ @@ -1407,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif @@ -1448,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable) size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame) + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; /* isLongOffset must be true if there are long offsets. @@ -1463,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); /* Decode literals section */ - { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; @@ -1511,15 +2040,18 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + if (dctx->litBufferLocation == ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); #endif } } -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) { - if (dst != dctx->previousDstEnd) { /* not contiguous */ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ dctx->dictEnd = dctx->previousDstEnd; dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); dctx->prefixStart = dst; @@ -1533,8 +2065,8 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize) { size_t dSize; - ZSTD_checkContinuity(dctx, dst); - dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); + ZSTD_checkContinuity(dctx, dst, dstCapacity); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); dctx->previousDstEnd = (char*)dst + dSize; return dSize; } diff --git a/lib/decompress/zstd_decompress_block.h b/lib/decompress/zstd_decompress_block.h index b5715c168e2b..c61a9d0c4b36 100644 --- a/lib/decompress/zstd_decompress_block.h +++ b/lib/decompress/zstd_decompress_block.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -33,6 +33,12 @@ */ + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming = 0, + is_streaming = 1 +} streaming_operation; + /* ZSTD_decompressBlock_internal() : * decompress block, starting at `src`, * into destination buffer `dst`. @@ -41,7 +47,7 @@ */ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame); + const void* src, size_t srcSize, const int frame, const streaming_operation streaming); /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) @@ -54,7 +60,7 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, */ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2); diff --git a/lib/decompress/zstd_decompress_internal.h b/lib/decompress/zstd_decompress_internal.h index f80b471e9940..2b5a53850ac2 100644 --- a/lib/decompress/zstd_decompress_internal.h +++ b/lib/decompress/zstd_decompress_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -20,7 +20,7 @@ * Dependencies *********************************************************/ #include "../common/mem.h" /* BYTE, U16, U32 */ -#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */ +#include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */ @@ -40,7 +40,7 @@ static UNUSED_ATTR const U32 OF_base[MaxOff+1] = { 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; -static UNUSED_ATTR const U32 OF_bits[MaxOff+1] = { +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -99,6 +99,29 @@ typedef enum { ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */ } ZSTD_dictUses_e; +/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */ +typedef struct { + const ZSTD_DDict** ddictPtrTable; + size_t ddictPtrTableSize; + size_t ddictPtrCount; +} ZSTD_DDictHashSet; + +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuffer */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */ + ZSTD_split = 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -113,6 +136,7 @@ struct ZSTD_DCtx_s const void* dictEnd; /* end of previous segment */ size_t expected; ZSTD_frameHeader fParams; + U64 processedCSize; U64 decodedSize; blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ ZSTD_dStage stage; @@ -128,7 +152,9 @@ struct ZSTD_DCtx_s size_t litSize; size_t rleSize; size_t staticSize; +#if DYNAMIC_BMI2 != 0 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ +#endif /* dictionary */ ZSTD_DDict* ddictLocal; @@ -136,6 +162,8 @@ struct ZSTD_DCtx_s U32 dictID; int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ /* streaming */ ZSTD_dStreamStage streamStage; @@ -148,16 +176,21 @@ struct ZSTD_DCtx_s size_t outStart; size_t outEnd; size_t lhSize; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) void* legacyContext; U32 previousLegacyVersion; U32 legacyVersion; +#endif U32 hostageByte; int noForwardProgress; ZSTD_bufferMode_e outBufferMode; ZSTD_outBuffer expectedOutBuffer; /* workspace */ - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; size_t oversizedDuration; @@ -166,8 +199,21 @@ struct ZSTD_DCtx_s void const* dictContentBeginForFuzzing; void const* dictContentEndForFuzzing; #endif + + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 != 0 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} /*-******************************************************* * Shared internal functions @@ -184,7 +230,7 @@ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, * If yes, do nothing (continue on current segment). * If not, classify previous segment as "external dictionary", and start a new segment. * This function cannot fail. */ -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst); +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize); #endif /* ZSTD_DECOMPRESS_INTERNAL_H */ |