diff options
author | Randall Stewart <rrs@FreeBSD.org> | 2018-06-07 18:18:13 +0000 |
---|---|---|
committer | Randall Stewart <rrs@FreeBSD.org> | 2018-06-07 18:18:13 +0000 |
commit | 89e560f441bb214495715039288c99442b3b5aea (patch) | |
tree | 92e58604010cc5bfd9f7e210d979ee8cfa36fcb7 /sys/netinet/tcp_stacks/sack_filter.c | |
parent | ce024bdc0c7e70c5bc32ddd2329ccd04ab747514 (diff) | |
download | src-89e560f441bb214495715039288c99442b3b5aea.tar.gz src-89e560f441bb214495715039288c99442b3b5aea.zip |
This commit brings in a new refactored TCP stack called Rack.
Rack includes the following features:
- A different SACK processing scheme (the old sack structures are not used).
- RACK (Recent acknowledgment) where counting dup-acks is no longer done
instead time is used to knwo when to retransmit. (see the I-D)
- TLP (Tail Loss Probe) where we will probe for tail-losses to attempt
to try not to take a retransmit time-out. (see the I-D)
- Burst mitigation using TCPHTPS
- PRR (partial rate reduction) see the RFC.
Once built into your kernel, you can select this stack by either
socket option with the name of the stack is "rack" or by setting
the global sysctl so the default is rack.
Note that any connection that does not support SACK will be kicked
back to the "default" base FreeBSD stack (currently known as "default").
To build this into your kernel you will need to enable in your
kernel:
makeoptions WITH_EXTRA_TCP_STACKS=1
options TCPHPTS
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D15525
Notes
Notes:
svn path=/head/; revision=334804
Diffstat (limited to 'sys/netinet/tcp_stacks/sack_filter.c')
-rw-r--r-- | sys/netinet/tcp_stacks/sack_filter.c | 706 |
1 files changed, 706 insertions, 0 deletions
diff --git a/sys/netinet/tcp_stacks/sack_filter.c b/sys/netinet/tcp_stacks/sack_filter.c new file mode 100644 index 000000000000..993d5851db79 --- /dev/null +++ b/sys/netinet/tcp_stacks/sack_filter.c @@ -0,0 +1,706 @@ +/*- + * Copyright (c) 2017 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/mbuf.h> +#include <sys/sockopt.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_seq.h> +#ifndef _KERNEL +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <strings.h> +#include <stdlib.h> +#include <limits.h> +#include <getopt.h> +#endif +#include "sack_filter.h" + +/* + * Sack filter is used to filter out sacks + * that have already been processed. The idea + * is pretty simple really, consider two sacks + * + * SACK 1 + * cum-ack A + * sack B - C + * SACK 2 + * cum-ack A + * sack D - E + * sack B - C + * + * The previous sack information (B-C) is repeated + * in SACK 2. If the receiver gets SACK 1 and then + * SACK 2 then any work associated with B-C as already + * been completed. This only effects where we may have + * (as in bbr or rack) cases where we walk a linked list. + * + * Now the utility trys to keep everything in a single + * cache line. This means that its not perfect and + * it could be that so big of sack's come that a + * "remembered" processed sack falls off the list and + * so gets re-processed. Thats ok, it just means we + * did some extra work. We could of course take more + * cache line hits by expanding the size of this + * structure, but then that would cost more. + */ + +#ifndef _KERNEL +int detailed_dump = 0; +uint64_t cnt_skipped_oldsack = 0; +uint64_t cnt_used_oldsack = 0; +int highest_used=0; +int over_written=0; +int empty_avail=0; +int no_collapse = 0; +FILE *out = NULL; +FILE *in = NULL; +#endif + +#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) +#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits) +#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) + +#ifndef _KERNEL +static +#endif +void +sack_filter_clear(struct sack_filter *sf, tcp_seq seq) +{ + sf->sf_ack = seq; + sf->sf_bits = 0; + sf->sf_cur = 0; + sf->sf_used = 0; +} +/* + * Given a previous sack filter block, filter out + * any entries where the cum-ack moves over them + * fully or partially. + */ +static void +sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) +{ + int32_t i; + /* start with the oldest */ + for (i = 0; i < SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i)) { + if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { + /* This block is consumed */ + sf->sf_bits = sack_blk_clr(sf, i); + sf->sf_used--; + } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) { + /* Some of it is acked */ + sf->sf_blks[i].start = th_ack; + /* We could in theory break here, but + * there are some broken implementations + * that send multiple blocks. We want + * to catch them all with similar seq's. + */ + } + } + } + sf->sf_ack = th_ack; +} + +/* + * Return true if you find that + * the sackblock b is on the score + * board. Update it along the way + * if part of it is on the board. + */ +static int32_t +is_sack_on_board(struct sack_filter *sf, struct sackblk *b) +{ + int32_t i, cnt; + for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { + if (sack_blk_used(sf, i)) { + if (SEQ_LT(b->start, sf->sf_ack)) { + /* Behind cum-ack update */ + b->start = sf->sf_ack; + } + if (SEQ_LT(b->end, sf->sf_ack)) { + /* End back behind too */ + b->end = sf->sf_ack; + } + if (b->start == b->end) + return(1); + /* Jonathans Rule 1 */ + if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && + SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * Our board has this entirely in + * whole or in part: + * + * board |-------------| + * sack |-------------| + * <or> + * board |-------------| + * sack |----| + * + */ + return(1); + } + /* Jonathans Rule 2 */ + if(SEQ_LT(sf->sf_blks[i].end, b->start)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + /* Jonathans Rule 3 */ + if (SEQ_GT(sf->sf_blks[i].start, b->end)) { + /** + * Not near each other: + * + * board |---| + * sack |---| + */ + goto nxt_blk; + } + if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * <or> + * board |--------| + * sack |--------------| + * + * up with this one (we have part of it). + * 1) Update the board block to the new end + * and + * 2) Update the start of this block to my end. + */ + b->start = sf->sf_blks[i].end; + sf->sf_blks[i].end = b->end; + goto nxt_blk; + } + if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * <or> + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + b->end = sf->sf_blks[i].start; + sf->sf_blks[i].start = b->start; + goto nxt_blk; + } + } + nxt_blk: + i++; + i %= SACK_FILTER_BLOCKS; + } + /* Did we totally consume it in pieces? */ + if (b->start != b->end) + return(0); + else + return(1); +} + +static int32_t +sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) +{ + int32_t num, i; + struct sackblk blkboard[TCP_MAX_SACK]; + /* + * An old sack has arrived. It may contain data + * we do not have. We might not have it since + * we could have had a lost ack <or> we might have the + * entire thing on our current board. We want to prune + * off anything we have. With this function though we + * won't add to the board. + */ + for( i = 0, num = 0; i<numblks; i++ ) { + if (is_sack_on_board(sf, &in[i])) { +#ifndef _KERNEL + cnt_skipped_oldsack++; +#endif + continue; + } + /* Did not find it (or found only + * a piece of it). Copy it to + * our outgoing board. + */ + memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); +#ifndef _KERNEL + cnt_used_oldsack++; +#endif + num++; + } + if (num) { + memcpy(in, blkboard, (num * sizeof(struct sackblk))); + } + return (num); +} + +/* + * Given idx its used but there is space available + * move the entry to the next free slot + */ +static void +sack_move_to_empty(struct sack_filter *sf, uint32_t idx) +{ + int32_t i, cnt; + + i = (idx + 1) % SACK_FILTER_BLOCKS; + for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) { + if (sack_blk_used(sf, i) == 0) { + memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_clr(sf, idx); + sf->sf_bits = sack_blk_set(sf, i); + return; + } + i++; + i %= SACK_FILTER_BLOCKS; + } +} + +static int32_t +sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + struct sackblk blkboard[TCP_MAX_SACK]; + int32_t num, i; + /* + * First lets trim the old and possibly + * throw any away we have. + */ + for(i=0, num=0; i<numblks; i++) { + if (is_sack_on_board(sf, &in[i])) + continue; + memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); + num++; + } + if (num == 0) + return(num); + + /* Now what we are left is either + * completely merged on to the board + * from the above steps, or are new + * and need to be added to the board + * with the last one updated to current. + * + * First copy it out we want to return that + * to our caller for processing. + */ + memcpy(in, blkboard, (num * sizeof(struct sackblk))); + numblks = num; + /* Now go through and add to our board as needed */ + for(i=(num-1); i>=0; i--) { + if (is_sack_on_board(sf, &blkboard[i])) + continue; + /* Add this guy its not listed */ + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + if ((sack_blk_used(sf, sf->sf_cur)) && + (sf->sf_used < SACK_FILTER_BLOCKS)) { + sack_move_to_empty(sf, sf->sf_cur); + } +#ifndef _KERNEL + if (sack_blk_used(sf, sf->sf_cur)) { + over_written++; + if (sf->sf_used < SACK_FILTER_BLOCKS) + empty_avail++; + } +#endif + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + if (sack_blk_used(sf, sf->sf_cur) == 0) { + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + } + } + return(numblks); +} + +/* + * Given a sack block on the board (the skip index) see if + * any other used entries overlap or meet, if so return the index. + */ +static int32_t +sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip) +{ + int32_t i; + + for(i=0; i<SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i) == 0) + continue; + if (i == skip) + continue; + if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) && + SEQ_LEQ(sf->sf_blks[i].end, sb->end) && + SEQ_LEQ(sf->sf_blks[i].start, sb->start)) { + /** + * The two board blocks meet: + * + * board1 |--------| + * board2 |----------| + * <or> + * board1 |--------| + * board2 |--------------| + * <or> + * board1 |--------| + * board2 |--------| + */ + return(i); + } + if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) && + SEQ_GEQ(sf->sf_blks[i].start, sb->start) && + SEQ_GEQ(sf->sf_blks[i].end, sb->end)) { + /** + * The board block partial meets: + * + * board |--------| + * sack |----------| + * <or> + * board |----| + * sack |----------| + * 1) Update the board block to the new start + * and + * 2) Update the start of this block to my end. + */ + return(i); + } + } + return (-1); +} + +/* + * Collapse entry src into entry into + * and free up the src entry afterwards. + */ +static void +sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) +{ + if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { + /* src has a lower starting point */ + sf->sf_blks[into].start = sf->sf_blks[src].start; + } + if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { + /* src has a higher ending point */ + sf->sf_blks[into].end = sf->sf_blks[src].end; + } + sf->sf_bits = sack_blk_clr(sf, src); + sf->sf_used--; +} + +static void +sack_board_collapse(struct sack_filter *sf) +{ + int32_t i, j, i_d, j_d; + + for(i=0; i<SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i) == 0) + continue; + /* + * Look at all other blocks but this guy + * to see if they overlap. If so we collapse + * the two blocks together. + */ + j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i); + if (j == -1) { + /* No overlap */ + continue; + } + /* + * Ok j and i overlap with each other, collapse the + * one out furthest away from the current position. + */ + if (sf->sf_cur > i) + i_d = sf->sf_cur - i; + else + i_d = i - sf->sf_cur; + if (sf->sf_cur > j) + j_d = sf->sf_cur - j; + else + j_d = j - sf->sf_cur; + if (j_d > i_d) { + sack_collapse(sf, j, i); + } else + sack_collapse(sf, i, j); + } +} + +#ifndef _KERNEL +static +#endif +int +sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) +{ + int32_t i, ret; + + if (numblks > TCP_MAX_SACK) { + panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", + sf, in, + numblks); + return(numblks); + } + if ((sf->sf_used == 0) && numblks) { + /* + * We are brand new add the blocks in + * reverse order. Note we can see more + * than one in new, since ack's could be lost. + */ + sf->sf_ack = th_ack; + for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { + memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); + sf->sf_bits = sack_blk_set(sf, sf->sf_cur); + sf->sf_cur++; + sf->sf_cur %= SACK_FILTER_BLOCKS; + sf->sf_used++; +#ifndef _KERNEL + if (sf->sf_used > highest_used) + highest_used = sf->sf_used; +#endif + } + if (sf->sf_cur) + sf->sf_cur--; + return(numblks); + } + if (SEQ_GT(th_ack, sf->sf_ack)) { + sack_filter_prune(sf, th_ack); + } + if (numblks) { + if (SEQ_GEQ(th_ack, sf->sf_ack)) { + ret = sack_filter_new(sf, in, numblks, th_ack); + } else { + ret = sack_filter_old(sf, in, numblks); + } + } else + ret = 0; +#ifndef _KERNEL + if ((sf->sf_used > 1) && (no_collapse == 0)) + sack_board_collapse(sf); + +#else + if (sf->sf_used > 1) + sack_board_collapse(sf); + +#endif + return (ret); +} + +#ifndef _KERNEL +uint64_t saved=0; +uint64_t tot_sack_blks=0; + +static void +sack_filter_dump(FILE *out, struct sack_filter *sf) +{ + int i; + fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n", + sf->sf_ack, sf->sf_bits, + sf->sf_cur, sf->sf_used); + + for(i=0; i<SACK_FILTER_BLOCKS; i++) { + if (sack_blk_used(sf, i)) { + fprintf(out, "Entry:%d start:%u end:%u\n", i, + sf->sf_blks[i].start, + sf->sf_blks[i].end); + } + } +} + +int +main(int argc, char **argv) +{ + char buffer[512]; + struct sackblk blks[TCP_MAX_SACK]; + FILE *err; + tcp_seq th_ack, snd_una; + struct sack_filter sf; + int32_t numblks,i; + int snd_una_set=0; + double a, b, c; + int invalid_sack_print = 0; + uint32_t chg_remembered=0; + uint32_t sack_chg=0; + char line_buf[10][256]; + int line_buf_at=0; + + in = stdin; + out = stdout; + while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { + switch (i) { + case 'n': + no_collapse = 1; + break; + case 'd': + detailed_dump = 1; + break; + case'I': + invalid_sack_print = 1; + break; + case 'i': + in = fopen(optarg, "r"); + if (in == NULL) { + fprintf(stderr, "Fatal error can't open %s for input\n", optarg); + exit(-1); + } + break; + case 'o': + out = fopen(optarg, "w"); + if (out == NULL) { + fprintf(stderr, "Fatal error can't open %s for output\n", optarg); + exit(-1); + } + break; + default: + case '?': + case 'h': + fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); + return(0); + break; + }; + } + sack_filter_clear(&sf, 0); + memset(buffer, 0, sizeof(buffer)); + memset(blks, 0, sizeof(blks)); + numblks = 0; + fprintf(out, "************************************\n"); + while (fgets(buffer, sizeof(buffer), in) != NULL) { + sprintf(line_buf[line_buf_at], "%s", buffer); + line_buf_at++; + if (strncmp(buffer, "QUIT", 4) == 0) { + break; + } else if (strncmp(buffer, "DONE", 4) == 0) { + int nn, ii; + if (numblks) { + uint32_t szof, tot_chg; + for(ii=0; ii<line_buf_at; ii++) { + fprintf(out, "%s", line_buf[ii]); + } + fprintf(out, "------------------------------------\n"); + nn = sack_filter_blks(&sf, blks, numblks, th_ack); + saved += numblks - nn; + tot_sack_blks += numblks; + fprintf(out, "ACK:%u\n", sf.sf_ack); + for(ii=0, tot_chg=0; ii<nn; ii++) { + szof = blks[ii].end - blks[ii].start; + tot_chg += szof; + fprintf(out, "SACK:%u:%u [%u]\n", + blks[ii].start, + blks[ii].end, szof); + } + fprintf(out,"************************************\n"); + chg_remembered = tot_chg; + if (detailed_dump) { + sack_filter_dump(out, &sf); + fprintf(out,"************************************\n"); + } + } + memset(blks, 0, sizeof(blks)); + memset(line_buf, 0, sizeof(line_buf)); + line_buf_at=0; + numblks = 0; + } else if (strncmp(buffer, "CHG:", 4) == 0) { + sack_chg = strtoul(&buffer[4], NULL, 0); + if ((sack_chg != chg_remembered) && + (sack_chg > chg_remembered)){ + fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n", + sack_chg, chg_remembered + ); + } + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "RXT", 3) == 0) { + sack_filter_clear(&sf, snd_una); + } else if (strncmp(buffer, "ACK:", 4) == 0) { + th_ack = strtoul(&buffer[4], NULL, 0); + if (snd_una_set == 0) { + snd_una = th_ack; + snd_una_set = 1; + } else if (SEQ_GT(th_ack, snd_una)) { + snd_una = th_ack; + } + } else if (strncmp(buffer, "EXIT", 4) == 0) { + sack_filter_clear(&sf, snd_una); + sack_chg = chg_remembered = 0; + } else if (strncmp(buffer, "SACK:", 5) == 0) { + char *end=NULL; + uint32_t start; + uint32_t endv; + start = strtoul(&buffer[5], &end, 0); + if (end) { + endv = strtoul(&end[1], NULL, 0); + } else { + fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); + continue; + } + if (SEQ_LT(endv, start)) { + fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); + continue; + } + if (numblks == TCP_MAX_SACK) { + fprintf(out, "--Exceeded max %d\n", numblks); + exit(0); + } + blks[numblks].start = start; + blks[numblks].end = endv; + numblks++; + } + memset(buffer, 0, sizeof(buffer)); + } + if (in != stdin) { + fclose(in); + } + if (out != stdout) { + fclose(out); + } + a = saved * 100.0; + b = tot_sack_blks * 1.0; + if (b > 0.0) + c = a/b; + else + c = 0.0; + if (out != stdout) + err = stdout; + else + err = stderr; + fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n", + saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail); + return(0); +} +#endif |