From 89e560f441bb214495715039288c99442b3b5aea Mon Sep 17 00:00:00 2001 From: Randall Stewart Date: Thu, 7 Jun 2018 18:18:13 +0000 Subject: This commit brings in a new refactored TCP stack called Rack. Rack includes the following features: - A different SACK processing scheme (the old sack structures are not used). - RACK (Recent acknowledgment) where counting dup-acks is no longer done instead time is used to knwo when to retransmit. (see the I-D) - TLP (Tail Loss Probe) where we will probe for tail-losses to attempt to try not to take a retransmit time-out. (see the I-D) - Burst mitigation using TCPHTPS - PRR (partial rate reduction) see the RFC. Once built into your kernel, you can select this stack by either socket option with the name of the stack is "rack" or by setting the global sysctl so the default is rack. Note that any connection that does not support SACK will be kicked back to the "default" base FreeBSD stack (currently known as "default"). To build this into your kernel you will need to enable in your kernel: makeoptions WITH_EXTRA_TCP_STACKS=1 options TCPHPTS Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D15525 --- sys/netinet/tcp_stacks/sack_filter.c | 706 +++++++++++++++++++++++++++++++++++ 1 file changed, 706 insertions(+) create mode 100644 sys/netinet/tcp_stacks/sack_filter.c (limited to 'sys/netinet/tcp_stacks/sack_filter.c') diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c new file mode 100644 index 000000000000..993d5851db79 --- /dev/null +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -0,0 +1,706 @@ +/*- + * Copyright (c) 2017 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#endif +#include "sack_filter.h" + +/* + * Sack filter is used to filter out sacks + * that have already been processed. The idea + * is pretty simple really, consider two sacks + * + * SACK 1 + * cum-ack A + * sack B - C + * SACK 2 + * cum-ack A + * sack D - E + * sack B - C + * + * The previous sack information (B-C) is repeated + * in SACK 2. If the receiver gets SACK 1 and then + * SACK 2 then any work associated with B-C as already + * been completed. This only effects where we may have + * (as in bbr or rack) cases where we walk a linked list. + * + * Now the utility trys to keep everything in a single + * cache line. This means that its not perfect and + * it could be that so big of sack's come that a + * "remembered" processed sack falls off the list and + * so gets re-processed. Thats ok, it just means we + * did some extra work. We could of course take more + * cache line hits by expanding the size of this + * structure, but then that would cost more. + */ + +#ifndef _KERNEL +int detailed_dump = 0; +uint64_t cnt_skipped_oldsack = 0; +uint64_t cnt_used_oldsack = 0; +int highest_used=0; +int over_written=0; +int empty_avail=0; +int no_collapse = 0; +FILE *out = NULL; +FILE *in = NULL; +#endif + +#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) +#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits) +#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) + +#ifndef _KERNEL +static +#endif +void +sack_filter_clear(struct sack_filter *sf, tcp_seq seq) +{ + sf->sf_ack = seq; + sf->sf_bits = 0; + sf->sf_cur = 0; + sf->sf_used = 0; +} +/* + * Given a previous sack filter block, filter out + * any entries where the cum-ack moves over them + * fully or partially. + */ +static void +sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) +{ + int32_t i; + /* start with the oldest */ + for (i = 0; i < SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i)) { + if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { + /* This block is consumed */ + sf->sf_bits = sack_blk_clr(sf, i); + sf->sf_used--; + } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) { + /* Some of it is acked */ + sf->sf_blks[i].start = th_ack; + /* We could in theory break here, but + * there are some broken implementations + * that send multiple blocks. We want + * to catch them all with similar seq's. + */ + } + } + } + sf->sf_ack = th_ack; +} + +/* + * Return true if you find that + * the sackblock b is on the score + * board. Update it along the way + * if part of it is on the board. + */ +static int32_t +is_sack_on_board(struct sack_filter *sf, struct sackblk *b) +{ + int32_t i, cnt; + for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { + if (sack_blk_used(sf, i)) { + if (SEQ_LT(b->start, sf->sf_ack)) { + /* Behind cum-ack update */ + b->start = sf->sf_ack; + } + if (SEQ_LT(b->end, sf->sf_ack)) { + /* End back behind too */ + b->end = sf->sf_ack; + } + if (b->start == b->end) + return(1); + /* Jonathans Rule 1 */ + if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && + SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * Our board has this entirely in + * whole or in part: + * + * board |-------------| + * sack |-------------| + * + * board |-------------| + * sack |----| + * + */ + return(1); + } + /* Jonathans Rule 2 */ + if(SEQ_LT(sf->sf_blks[i].end, b->start)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + /* Jonathans Rule 3 */ + if (SEQ_GT(sf->sf_blks[i].start, b->end)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |--------| + * sack |--------------| + * + * up with this one (we have part of it). + * 1) Update the board block to the new end + * and + * 2) Update the start of this block to my end. + */ + b->start = sf->sf_blks[i].end; + sf->sf_blks[i].end = b->end; + goto nxt_blk; + } + if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + b->end = sf->sf_blks[i].start; + sf->sf_blks[i].start = b->start; + goto nxt_blk; + } + } + nxt_blk: + i++; + i %= SACK_FILTER_BLOCKS; + } + /* Did we totally consume it in pieces? */ + if (b->start != b->end) + return(0); + else + return(1); +} + +static int32_t +sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) +{ + int32_t num, i; + struct sackblk blkboard[TCP_MAX_SACK]; + /* + * An old sack has arrived. It may contain data + * we do not have. We might not have it since + * we could have had a lost ack we might have the + * entire thing on our current board. We want to prune + * off anything we have. With this function though we + * won't add to the board. + */ + for( i = 0, num = 0; isf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_clr(sf, idx); + sf->sf_bits = sack_blk_set(sf, i); + return; + } + i++; + i %= SACK_FILTER_BLOCKS; + } +} + +static int32_t +sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + struct sackblk blkboard[TCP_MAX_SACK]; + int32_t num, i; + /* + * First lets trim the old and possibly + * throw any away we have. + */ + for(i=0, num=0; i=0; i--) { + if (is_sack_on_board(sf, &blkboard[i])) + continue; + /* Add this guy its not listed */ + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + if ((sack_blk_used(sf, sf->sf_cur)) && + (sf->sf_used < SACK_FILTER_BLOCKS)) { + sack_move_to_empty(sf, sf->sf_cur); + } +#ifndef _KERNEL + if (sack_blk_used(sf, sf->sf_cur)) { + over_written++; + if (sf->sf_used < SACK_FILTER_BLOCKS) + empty_avail++; + } +#endif + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + if (sack_blk_used(sf, sf->sf_cur) == 0) { + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + } + } + return(numblks); +} + +/* + * Given a sack block on the board (the skip index) see if + * any other used entries overlap or meet, if so return the index. + */ +static int32_t +sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip) +{ + int32_t i; + + for(i=0; isf_blks[i].end, sb->start) && + SEQ_LEQ(sf->sf_blks[i].end, sb->end) && + SEQ_LEQ(sf->sf_blks[i].start, sb->start)) { + /** + * The two board blocks meet: + * + * board1 |--------| + * board2 |----------| + * + * board1 |--------| + * board2 |--------------| + * + * board1 |--------| + * board2 |--------| + */ + return(i); + } + if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) && + SEQ_GEQ(sf->sf_blks[i].start, sb->start) && + SEQ_GEQ(sf->sf_blks[i].end, sb->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + return(i); + } + } + return (-1); +} + +/* + * Collapse entry src into entry into + * and free up the src entry afterwards. + */ +static void +sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) +{ + if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { + /* src has a lower starting point */ + sf->sf_blks[into].start = sf->sf_blks[src].start; + } + if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { + /* src has a higher ending point */ + sf->sf_blks[into].end = sf->sf_blks[src].end; + } + sf->sf_bits = sack_blk_clr(sf, src); + sf->sf_used--; +} + +static void +sack_board_collapse(struct sack_filter *sf) +{ + int32_t i, j, i_d, j_d; + + for(i=0; isf_blks[i], i); + if (j == -1) { + /* No overlap */ + continue; + } + /* + * Ok j and i overlap with each other, collapse the + * one out furthest away from the current position. + */ + if (sf->sf_cur > i) + i_d = sf->sf_cur - i; + else + i_d = i - sf->sf_cur; + if (sf->sf_cur > j) + j_d = sf->sf_cur - j; + else + j_d = j - sf->sf_cur; + if (j_d > i_d) { + sack_collapse(sf, j, i); + } else + sack_collapse(sf, i, j); + } +} + +#ifndef _KERNEL +static +#endif +int +sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + int32_t i, ret; + + if (numblks > TCP_MAX_SACK) { + panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", + sf, in, + numblks); + return(numblks); + } + if ((sf->sf_used == 0) && numblks) { + /* + * We are brand new add the blocks in + * reverse order. Note we can see more + * than one in new, since ack's could be lost. + */ + sf->sf_ack = th_ack; + for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + } + if (sf->sf_cur) + sf->sf_cur--; + return(numblks); + } + if (SEQ_GT(th_ack, sf->sf_ack)) { + sack_filter_prune(sf, th_ack); + } + if (numblks) { + if (SEQ_GEQ(th_ack, sf->sf_ack)) { + ret = sack_filter_new(sf, in, numblks, th_ack); + } else { + ret = sack_filter_old(sf, in, numblks); + } + } else + ret = 0; +#ifndef _KERNEL + if ((sf->sf_used > 1) && (no_collapse == 0)) + sack_board_collapse(sf); + +#else + if (sf->sf_used > 1) + sack_board_collapse(sf); + +#endif + return (ret); +} + +#ifndef _KERNEL +uint64_t saved=0; +uint64_t tot_sack_blks=0; + +static void +sack_filter_dump(FILE *out, struct sack_filter *sf) +{ + int i; + fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n", + sf->sf_ack, sf->sf_bits, + sf->sf_cur, sf->sf_used); + + for(i=0; isf_blks[i].start, + sf->sf_blks[i].end); + } + } +} + +int +main(int argc, char **argv) +{ + char buffer[512]; + struct sackblk blks[TCP_MAX_SACK]; + FILE *err; + tcp_seq th_ack, snd_una; + struct sack_filter sf; + int32_t numblks,i; + int snd_una_set=0; + double a, b, c; + int invalid_sack_print = 0; + uint32_t chg_remembered=0; + uint32_t sack_chg=0; + char line_buf[10][256]; + int line_buf_at=0; + + in = stdin; + out = stdout; + while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { + switch (i) { + case 'n': + no_collapse = 1; + break; + case 'd': + detailed_dump = 1; + break; + case'I': + invalid_sack_print = 1; + break; + case 'i': + in = fopen(optarg, "r"); + if (in == NULL) { + fprintf(stderr, "Fatal error can't open %s for input\n", optarg); + exit(-1); + } + break; + case 'o': + out = fopen(optarg, "w"); + if (out == NULL) { + fprintf(stderr, "Fatal error can't open %s for output\n", optarg); + exit(-1); + } + break; + default: + case '?': + case 'h': + fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); + return(0); + break; + }; + } + sack_filter_clear(&sf, 0); + memset(buffer, 0, sizeof(buffer)); + memset(blks, 0, sizeof(blks)); + numblks = 0; + fprintf(out, "************************************\n"); + while (fgets(buffer, sizeof(buffer), in) != NULL) { + sprintf(line_buf[line_buf_at], "%s", buffer); + line_buf_at++; + if (strncmp(buffer, "QUIT", 4) == 0) { + break; + } else if (strncmp(buffer, "DONE", 4) == 0) { + int nn, ii; + if (numblks) { + uint32_t szof, tot_chg; + for(ii=0; ii chg_remembered)){ + fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n", + sack_chg, chg_remembered + ); + } + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "RXT", 3) == 0) { + sack_filter_clear(&sf, snd_una); + } else if (strncmp(buffer, "ACK:", 4) == 0) { + th_ack = strtoul(&buffer[4], NULL, 0); + if (snd_una_set == 0) { + snd_una = th_ack; + snd_una_set = 1; + } else if (SEQ_GT(th_ack, snd_una)) { + snd_una = th_ack; + } + } else if (strncmp(buffer, "EXIT", 4) == 0) { + sack_filter_clear(&sf, snd_una); + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "SACK:", 5) == 0) { + char *end=NULL; + uint32_t start; + uint32_t endv; + start = strtoul(&buffer[5], &end, 0); + if (end) { + endv = strtoul(&end[1], NULL, 0); + } else { + fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); + continue; + } + if (SEQ_LT(endv, start)) { + fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); + continue; + } + if (numblks == TCP_MAX_SACK) { + fprintf(out, "--Exceeded max %d\n", numblks); + exit(0); + } + blks[numblks].start = start; + blks[numblks].end = endv; + numblks++; + } + memset(buffer, 0, sizeof(buffer)); + } + if (in != stdin) { + fclose(in); + } + if (out != stdout) { + fclose(out); + } + a = saved * 100.0; + b = tot_sack_blks * 1.0; + if (b > 0.0) + c = a/b; + else + c = 0.0; + if (out != stdout) + err = stdout; + else + err = stderr; + fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n", + saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail); + return(0); +} +#endif -- cgit v1.2.3