/*-
* Copyright (c) 2017-9 Netflix, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifndef _KERNEL
#define _WANT_TCPCB 1
#endif
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socket.h>
#ifdef _KERNEL
#include <sys/mbuf.h>
#include <sys/sockopt.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#ifndef _KERNEL
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include <limits.h>
#include <getopt.h>
#endif
#include "sack_filter.h"
/*
* Sack filter is used to filter out sacks
* that have already been processed. The idea
* is pretty simple really, consider two sacks
*
* SACK 1
* cum-ack A
* sack B - C
* SACK 2
* cum-ack A
* sack D - E
* sack B - C
*
* The previous sack information (B-C) is repeated
* in SACK 2. If the receiver gets SACK 1 and then
* SACK 2 then any work associated with B-C as already
* been completed. This only effects where we may have
* (as in bbr or rack) cases where we walk a linked list.
*
* Now the utility trys to keep everything in a single
* cache line. This means that its not perfect and
* it could be that so big of sack's come that a
* "remembered" processed sack falls off the list and
* so gets re-processed. Thats ok, it just means we
* did some extra work. We could of course take more
* cache line hits by expanding the size of this
* structure, but then that would cost more.
*/
#ifndef _KERNEL
int detailed_dump = 0;
uint64_t cnt_skipped_oldsack = 0;
uint64_t cnt_used_oldsack = 0;
int highest_used=0;
int over_written=0;
int empty_avail=0;
int no_collapse = 0;
FILE *out = NULL;
FILE *in = NULL;
#endif
#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
#ifndef _KERNEL
static
#endif
void
sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
{
sf->sf_ack = seq;
sf->sf_bits = 0;
sf->sf_cur = 0;
sf->sf_used = 0;
}
/*
* Given a previous sack filter block, filter out
* any entries where the cum-ack moves over them
* fully or partially.
*/
static void
sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
{
int32_t i;
/* start with the oldest */
for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i)) {
if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
/* This block is consumed */
sf->sf_bits = sack_blk_clr(sf, i);
sf->sf_used--;
} else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
/* Some of it is acked */
sf->sf_blks[i].start = th_ack;
/* We could in theory break here, but
* there are some broken implementations
* that send multiple blocks. We want
* to catch them all with similar seq's.
*/
}
}
}
sf->sf_ack = th_ack;
}
/*
* Return true if you find that
* the sackblock b is on the score
* board. Update it along the way
* if part of it is on the board.
*/
static int32_t
is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
{
int32_t i, cnt;
for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
if (sack_blk_used(sf, i)) {
if (SEQ_LT(b->start, sf->sf_ack)) {
/* Behind cum-ack update */
b->start = sf->sf_ack;
}
if (SEQ_LT(b->end, sf->sf_ack)) {
/* End back behind too */
b->end = sf->sf_ack;
}
if (b->start == b->end) {
return(1);
}
/* Jonathans Rule 1 */
if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* Our board has this entirely in
* whole or in part:
*
* board |-------------|
* sack |-------------|
* <or>
* board |-------------|
* sack |----|
*
*/
return(1);
}
/* Jonathans Rule 2 */
if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
/**
* Not near each other:
*
* board |---|
* sack |---|
*/
goto nxt_blk;
}
/* Jonathans Rule 3 */
if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
/**
* Not near each other:
*
* board |---|
* sack |---|
*/
goto nxt_blk;
}
if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
/**
* The board block partial meets:
*
* board |--------|
* sack |----------|
* <or>
* board |--------|
* sack |--------------|
*
* up with this one (we have part of it).
* 1) Update the board block to the new end
* and
* 2) Update the start of this block to my end.
*/
b->start = sf->sf_blks[i].end;
sf->sf_blks[i].end = b->end;
goto nxt_blk;
}
if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
/**
* The board block partial meets:
*
* board |--------|
* sack |----------|
* <or>
* board |----|
* sack |----------|
* 1) Update the board block to the new start
* and
* 2) Update the start of this block to my end.
*/
b->end = sf->sf_blks[i].start;
sf->sf_blks[i].start = b->start;
goto nxt_blk;
}
}
nxt_blk:
i++;
i %= SACK_FILTER_BLOCKS;
}
/* Did we totally consume it in pieces? */
if (b->start != b->end)
return(0);
else
return(1);
}
static int32_t
sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks)
{
int32_t num, i;
struct sackblk blkboard[TCP_MAX_SACK];
/*
* An old sack has arrived. It may contain data
* we do not have. We might not have it since
* we could have had a lost ack <or> we might have the
* entire thing on our current board. We want to prune
* off anything we have. With this function though we
* won't add to the board.
*/
for( i = 0, num = 0; i<numblks; i++ ) {
if (is_sack_on_board(sf, &in[i])) {
#ifndef _KERNEL
cnt_skipped_oldsack++;
#endif
continue;
}
/* Did not find it (or found only
* a piece of it). Copy it to
* our outgoing board.
*/
memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
#ifndef _KERNEL
cnt_used_oldsack++;
#endif
num++;
}
if (num) {
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
}
return (num);
}
/*
* Given idx its used but there is space available
* move the entry to the next free slot
*/
static void
sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
{
int32_t i, cnt;
i = (idx + 1) % SACK_FILTER_BLOCKS;
for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
if (sack_blk_used(sf, i) == 0) {
memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
sf->sf_bits = sack_blk_clr(sf, idx);
sf->sf_bits = sack_blk_set(sf, i);
return;
}
i++;
i %= SACK_FILTER_BLOCKS;
}
}
static int32_t
sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
{
struct sackblk blkboard[TCP_MAX_SACK];
int32_t num, i;
/*
* First lets trim the old and possibly
* throw any away we have.
*/
for(i=0, num=0; i<numblks; i++) {
if (is_sack_on_board(sf, &in[i]))
continue;
memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
num++;
}
if (num == 0)
return(num);
/* Now what we are left with is either
* completely merged on to the board
* from the above steps, or is new
* and need to be added to the board
* with the last one updated to current.
*
* First copy it out, we want to return that
* to our caller for processing.
*/
memcpy(in, blkboard, (num * sizeof(struct sackblk)));
numblks = num;
/* Now go through and add to our board as needed */
for(i=(num-1); i>=0; i--) {
if (is_sack_on_board(sf, &blkboard[i])) {
continue;
}
/* Add this guy its not listed */
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
if ((sack_blk_used(sf, sf->sf_cur)) &&
(sf->sf_used < SACK_FILTER_BLOCKS)) {
sack_move_to_empty(sf, sf->sf_cur);
}
#ifndef _KERNEL
if (sack_blk_used(sf, sf->sf_cur)) {
over_written++;
if (sf->sf_used < SACK_FILTER_BLOCKS)
empty_avail++;
}
#endif
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
if (sack_blk_used(sf, sf->sf_cur) == 0) {
sf->sf_used++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
}
}
return(numblks);
}
/*
* Given a sack block on the board (the skip index) see if
* any other used entries overlap or meet, if so return the index.
*/
static int32_t
sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
{
int32_t i;
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i) == 0)
continue;
if (i == skip)
continue;
if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
/**
* The two board blocks meet:
*
* board1 |--------|
* board2 |----------|
* <or>
* board1 |--------|
* board2 |--------------|
* <or>
* board1 |--------|
* board2 |--------|
*/
return(i);
}
if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
/**
* The board block partial meets:
*
* board |--------|
* sack |----------|
* <or>
* board |----|
* sack |----------|
* 1) Update the board block to the new start
* and
* 2) Update the start of this block to my end.
*/
return(i);
}
}
return (-1);
}
/*
* Collapse entry src into entry into
* and free up the src entry afterwards.
*/
static void
sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
{
if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
/* src has a lower starting point */
sf->sf_blks[into].start = sf->sf_blks[src].start;
}
if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
/* src has a higher ending point */
sf->sf_blks[into].end = sf->sf_blks[src].end;
}
sf->sf_bits = sack_blk_clr(sf, src);
sf->sf_used--;
}
static void
sack_board_collapse(struct sack_filter *sf)
{
int32_t i, j, i_d, j_d;
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i) == 0)
continue;
/*
* Look at all other blocks but this guy
* to see if they overlap. If so we collapse
* the two blocks together.
*/
j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
if (j == -1) {
/* No overlap */
continue;
}
/*
* Ok j and i overlap with each other, collapse the
* one out furthest away from the current position.
*/
if (sf->sf_cur > i)
i_d = sf->sf_cur - i;
else
i_d = i - sf->sf_cur;
if (sf->sf_cur > j)
j_d = sf->sf_cur - j;
else
j_d = j - sf->sf_cur;
if (j_d > i_d) {
sack_collapse(sf, j, i);
} else
sack_collapse(sf, i, j);
}
}
#ifndef _KERNEL
uint64_t saved=0;
uint64_t tot_sack_blks=0;
static void
sack_filter_dump(FILE *out, struct sack_filter *sf)
{
int i;
fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
sf->sf_ack, sf->sf_bits,
sf->sf_cur, sf->sf_used);
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i)) {
fprintf(out, "Entry:%d start:%u end:%u\n", i,
sf->sf_blks[i].start,
sf->sf_blks[i].end);
}
}
}
#endif
#ifndef _KERNEL
static
#endif
int
sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
tcp_seq th_ack)
{
int32_t i, ret;
if (numblks > TCP_MAX_SACK) {
#ifdef _KERNEL
panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
sf, in,
numblks);
#endif
return(numblks);
}
#ifndef _KERNEL
if ((sf->sf_used > 1) && (no_collapse == 0))
sack_board_collapse(sf);
#else
if (sf->sf_used > 1)
sack_board_collapse(sf);
#endif
if ((sf->sf_used == 0) && numblks) {
/*
* We are brand new add the blocks in
* reverse order. Note we can see more
* than one in new, since ack's could be lost.
*/
int cnt_added = 0;
sf->sf_ack = th_ack;
for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
sf->sf_cur++;
sf->sf_cur %= SACK_FILTER_BLOCKS;
sf->sf_used++;
cnt_added++;
#ifndef _KERNEL
if (sf->sf_used > highest_used)
highest_used = sf->sf_used;
#endif
}
if (sf->sf_cur)
sf->sf_cur--;
return (cnt_added);
}
if (SEQ_GT(th_ack, sf->sf_ack)) {
sack_filter_prune(sf, th_ack);
}
if (numblks) {
if (SEQ_GEQ(th_ack, sf->sf_ack)) {
ret = sack_filter_new(sf, in, numblks, th_ack);
} else {
ret = sack_filter_old(sf, in, numblks);
}
} else
ret = 0;
return (ret);
}
void
sack_filter_reject(struct sack_filter *sf, struct sackblk *in)
{
/*
* Given a specified block (that had made
* it past the sack filter). Reject that
* block triming it off any sack-filter block
* that has it. Usually because the block was
* too small and did not cover a whole send.
*
* This function will only "undo" sack-blocks
* that are fresh and touch the edges of
* blocks in our filter.
*/
int i;
for(i=0; i<SACK_FILTER_BLOCKS; i++) {
if (sack_blk_used(sf, i) == 0)
continue;
/*
* Now given the sack-filter block does it touch
* with one of the ends
*/
if (sf->sf_blks[i].end == in->end) {
/* The end moves back to start */
if (SEQ_GT(in->start, sf->sf_blks[i].start))
/* in-blk |----| */
/* sf-blk |---------| */
sf->sf_blks[i].end = in->start;
else {
/* It consumes this block */
/* in-blk |---------| */
/* sf-blk |------| */
/* <or> */
/* sf-blk |---------| */
sf->sf_bits = sack_blk_clr(sf, i);
sf->sf_used--;
}
continue;
}
if (sf->sf_blks[i].start == in->start) {
if (SEQ_LT(in->end, sf->sf_blks[i].end)) {
/* in-blk |----| */
/* sf-blk |---------| */
sf->sf_blks[i].start = in->end;
} else {
/* It consumes this block */
/* in-blk |----------| */
/* sf-blk |-------| */
/* <or> */
/* sf-blk |----------| */
sf->sf_bits = sack_blk_clr(sf, i);
sf->sf_used--;
}
continue;
}
}
}
#ifndef _KERNEL
int
main(int argc, char **argv)
{
char buffer[512];
struct sackblk blks[TCP_MAX_SACK];
FILE *err;
tcp_seq th_ack, snd_una, snd_max = 0;
struct sack_filter sf;
int32_t numblks,i;
int snd_una_set=0;
double a, b, c;
int invalid_sack_print = 0;
uint32_t chg_remembered=0;
uint32_t sack_chg=0;
char line_buf[10][256];
int line_buf_at=0;
in = stdin;
out = stdout;
while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
switch (i) {
case 'n':
no_collapse = 1;
break;
case 'd':
detailed_dump = 1;
break;
case'I':
invalid_sack_print = 1;
break;
case 'i':
in = fopen(optarg, "r");
if (in == NULL) {
fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
exit(-1);
}
break;
case 'o':
out = fopen(optarg, "w");
if (out == NULL) {
fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
exit(-1);
}
break;
default:
case '?':
case 'h':
fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
return(0);
break;
};
}
sack_filter_clear(&sf, 0);
memset(buffer, 0, sizeof(buffer));
memset(blks, 0, sizeof(blks));
numblks = 0;
fprintf(out, "************************************\n");
while (fgets(buffer, sizeof(buffer), in) != NULL) {
sprintf(line_buf[line_buf_at], "%s", buffer);
line_buf_at++;
if (strncmp(buffer, "QUIT", 4) == 0) {
break;
} else if (strncmp(buffer, "DUMP", 4) == 0) {
sack_filter_dump(out, &sf);
} else if (strncmp(buffer, "MAX:", 4) == 0) {
snd_max = strtoul(&buffer[4], NULL, 0);
} else if (strncmp(buffer, "COMMIT", 6) == 0) {
int nn, ii;
if (numblks) {
uint32_t szof, tot_chg;
for(ii=0; ii<line_buf_at; ii++) {
fprintf(out, "%s", line_buf[ii]);
}
fprintf(out, "------------------------------------\n");
nn = sack_filter_blks(&sf, blks, numblks, th_ack);
saved += numblks - nn;
tot_sack_blks += numblks;
fprintf(out, "ACK:%u\n", sf.sf_ack);
for(ii=0, tot_chg=0; ii<nn; ii++) {
szof = blks[ii].end - blks[ii].start;
tot_chg += szof;
fprintf(out, "SACK:%u:%u [%u]\n",
blks[ii].start,
blks[ii].end, szof);
}
fprintf(out,"************************************\n");
chg_remembered = tot_chg;
if (detailed_dump) {
sack_filter_dump(out, &sf);
fprintf(out,"************************************\n");
}
}
memset(blks, 0, sizeof(blks));
memset(line_buf, 0, sizeof(line_buf));
line_buf_at=0;
numblks = 0;
} else if (strncmp(buffer, "CHG:", 4) == 0) {
sack_chg = strtoul(&buffer[4], NULL, 0);
if ((sack_chg != chg_remembered) &&
(sack_chg > chg_remembered)){
fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
sack_chg, chg_remembered
);
}
sack_chg = chg_remembered = 0;
} else if (strncmp(buffer, "RXT", 3) == 0) {
sack_filter_clear(&sf, snd_una);
} else if (strncmp(buffer, "ACK:", 4) == 0) {
th_ack = strtoul(&buffer[4], NULL, 0);
if (snd_una_set == 0) {
snd_una = th_ack;
snd_una_set = 1;
} else if (SEQ_GT(th_ack, snd_una)) {
snd_una = th_ack;
}
} else if (strncmp(buffer, "EXIT", 4) == 0) {
sack_filter_clear(&sf, snd_una);
sack_chg = chg_remembered = 0;
} else if (strncmp(buffer, "SACK:", 5) == 0) {
char *end=NULL;
uint32_t start;
uint32_t endv;
start = strtoul(&buffer[5], &end, 0);
if (end) {
endv = strtoul(&end[1], NULL, 0);
} else {
fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
continue;
}
if (SEQ_GT(endv, snd_max))
snd_max = endv;
if (SEQ_LT(endv, start)) {
fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
continue;
}
if (numblks == TCP_MAX_SACK) {
fprintf(out, "--Exceeded max %d\n", numblks);
exit(0);
}
blks[numblks].start = start;
blks[numblks].end = endv;
numblks++;
} else if (strncmp(buffer, "REJ:n:n", 4) == 0) {
struct sackblk in;
char *end=NULL;
in.start = strtoul(&buffer[4], &end, 0);
if (end) {
in.end = strtoul(&end[1], NULL, 0);
sack_filter_reject(&sf, &in);
} else
fprintf(out, "Invalid input END:A:B\n");
} else if (strncmp(buffer, "HELP", 4) == 0) {
fprintf(out, "You can input:\n");
fprintf(out, "SACK:S:E -- to define a sack block\n");
fprintf(out, "RXT -- to clear the filter without changing the remembered\n");
fprintf(out, "EXIT -- To clear the sack filter and start all fresh\n");
fprintf(out, "ACK:N -- To advance the cum-ack to N\n");
fprintf(out, "MAX:N -- To set send-max to N\n");
fprintf(out, "COMMIT -- To apply the sack you built to the filter and dump the filter\n");
fprintf(out, "DUMP -- To display the current contents of the sack filter\n");
fprintf(out, "QUIT -- To exit this program\n");
} else {
fprintf(out, "Command %s unknown\n", buffer);
}
memset(buffer, 0, sizeof(buffer));
}
if (in != stdin) {
fclose(in);
}
if (out != stdout) {
fclose(out);
}
a = saved * 100.0;
b = tot_sack_blks * 1.0;
if (b > 0.0)
c = a/b;
else
c = 0.0;
if (out != stdout)
err = stdout;
else
err = stderr;
fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);
return(0);
}
#endif