Index: bgpd/session.c =================================================================== RCS file: /home/cvs/private/hrs/openbgpd/bgpd/session.c,v retrieving revision 1.1.1.8 retrieving revision 1.13 diff -u -p -r1.1.1.8 -r1.13 --- bgpd/session.c 14 Feb 2010 20:19:57 -0000 1.1.1.8 +++ bgpd/session.c 8 Dec 2012 20:17:59 -0000 1.13 @@ -1,4 +1,4 @@ -/* $OpenBSD: session.c,v 1.293 2009/06/07 05:56:24 eric Exp $ */ +/* $OpenBSD: session.c,v 1.325 2012/09/18 09:45:50 claudio Exp $ */ /* * Copyright (c) 2003, 2004, 2005 Henning Brauer @@ -21,18 +21,21 @@ #include #include +#include +#include #include +#include #include #include #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -50,7 +53,12 @@ #define PFD_PIPE_ROUTE_CTL 2 #define PFD_SOCK_CTL 3 #define PFD_SOCK_RCTL 4 -#define PFD_LISTENERS_START 5 +#define PFD_SOCK_PFKEY 5 +#define PFD_LISTENERS_START 6 + +#if defined(__FreeBSD__) /* FreeBSD has no LINK_STATE_IS_UP macro. */ +#define LINK_STATE_IS_UP(_s) ((_s) >= LINK_STATE_UP) +#endif /* defined(__FreeBSD__) */ void session_sighdlr(int); int setup_listeners(u_int *); @@ -65,9 +73,9 @@ void session_accept(int); int session_connect(struct peer *); void session_tcp_established(struct peer *); void session_capa_ann_none(struct peer *); -int session_capa_add(struct peer *, struct buf *, u_int8_t, u_int8_t, - u_int8_t *); -int session_capa_add_mp(struct buf *, u_int16_t, u_int8_t); +int session_capa_add(struct ibuf *, u_int8_t, u_int8_t); +int session_capa_add_mp(struct ibuf *, u_int8_t); +int session_capa_add_gr(struct peer *, struct ibuf *, u_int8_t); struct bgp_msg *session_newmsg(enum msg_type, u_int16_t); int session_sendmsg(struct bgp_msg *, struct peer *); void session_open(struct peer *); @@ -75,30 +83,34 @@ void session_keepalive(struct peer *); void session_update(u_int32_t, void *, size_t); void session_notification(struct peer *, u_int8_t, u_int8_t, void *, ssize_t); -void session_rrefresh(struct peer 
*, u_int16_t, u_int8_t); +void session_rrefresh(struct peer *, u_int8_t); +int session_graceful_restart(struct peer *); +int session_graceful_is_restarting(struct peer *); +int session_graceful_stop(struct peer *); int session_dispatch_msg(struct pollfd *, struct peer *); +int session_process_msg(struct peer *); int parse_header(struct peer *, u_char *, u_int16_t *, u_int8_t *); int parse_open(struct peer *); int parse_update(struct peer *); int parse_refresh(struct peer *); int parse_notification(struct peer *); int parse_capabilities(struct peer *, u_char *, u_int16_t, u_int32_t *); +int capa_neg_calc(struct peer *); void session_dispatch_imsg(struct imsgbuf *, int, u_int *); void session_up(struct peer *); void session_down(struct peer *); void session_demote(struct peer *, int); -int la_cmp(struct listen_addr *, struct listen_addr *); -struct peer *getpeerbyip(struct sockaddr *); -int session_match_mask(struct peer *, struct sockaddr *); -struct peer *getpeerbyid(u_int32_t); -static struct sockaddr *addr2sa(struct bgpd_addr *, u_int16_t); +int la_cmp(struct listen_addr *, struct listen_addr *); +struct peer *getpeerbyip(struct sockaddr *); +int session_match_mask(struct peer *, struct bgpd_addr *); +struct peer *getpeerbyid(u_int32_t); -struct bgpd_config *conf, *nconf = NULL; +struct bgpd_config *conf, *nconf; struct bgpd_sysdep sysdep; -struct peer *npeers; -volatile sig_atomic_t session_quit = 0; -int pending_reconf = 0; +struct peer *peers, *npeers; +volatile sig_atomic_t session_quit; +int pending_reconf; int csock = -1, rcsock = -1; u_int peer_cnt; struct imsgbuf *ibuf_rde; @@ -106,6 +118,7 @@ struct imsgbuf *ibuf_rde_ctl; struct imsgbuf *ibuf_main; struct mrt_head mrthead; +time_t pauseaccept; void session_sighdlr(int sig) @@ -125,6 +138,22 @@ setup_listeners(u_int *la_cnt) int opt; struct listen_addr *la; u_int cnt = 0; +#if defined(__FreeBSD__) + int s; + + /* Check if TCP_MD5SIG is supported. 
*/ + s = socket(PF_LOCAL, SOCK_STREAM, 0); + if (s < 0) + fatal("socket open for TCP_MD5SIG check"); + opt = TF_SIGNATURE; + if (setsockopt(s, IPPROTO_TCP, TCP_MD5SIG, &opt, sizeof(opt)) == -1) { + if (errno == ENOPROTOOPT || errno == EINVAL) + sysdep.no_md5sig = 1; + else + fatal("setsockopt TCP_MD5SIG"); + } + close(s); +#endif /* defined(__FreeBSD__) */ TAILQ_FOREACH(la, conf->listen_addrs, entry) { la->reconf = RECONF_NONE; @@ -140,6 +169,7 @@ setup_listeners(u_int *la_cnt) } opt = 1; +#if !defined(__FreeBSD__) if (setsockopt(la->fd, IPPROTO_TCP, TCP_MD5SIG, &opt, sizeof(opt)) == -1) { if (errno == ENOPROTOOPT) { /* system w/o md5sig */ @@ -148,6 +178,7 @@ setup_listeners(u_int *la_cnt) } else fatal("setsockopt TCP_MD5SIG"); } +#endif /* !defined(__FreeBSD__) */ /* set ttl to 255 so that ttl-security works */ if (la->sa.ss_family == AF_INET && setsockopt(la->fd, @@ -175,12 +206,10 @@ setup_listeners(u_int *la_cnt) } pid_t -session_main(struct bgpd_config *config, struct peer *cpeers, - struct network_head *net_l, struct filter_head *rules, - struct mrt_head *m_l, struct rib_names *rib_l, int pipe_m2s[2], - int pipe_s2r[2], int pipe_m2r[2], int pipe_s2rctl[2]) +session_main(int pipe_m2s[2], int pipe_s2r[2], int pipe_m2r[2], + int pipe_s2rctl[2]) { - int nfds, timeout; + int nfds, timeout, pfkeysock; unsigned int i, j, idx_peers, idx_listeners, idx_mrts; pid_t pid; u_int pfd_elms = 0, peer_l_elms = 0, mrt_l_elms = 0; @@ -189,19 +218,13 @@ session_main(struct bgpd_config *config, u_int32_t ctl_queued; struct passwd *pw; struct peer *p, **peer_l = NULL, *last, *next; - struct network *net; - struct mrt *m, **mrt_l = NULL; - struct filter_rule *r; + struct mrt *m, *xm, **mrt_l = NULL; struct pollfd *pfd = NULL; struct ctl_conn *ctl_conn; struct listen_addr *la; - struct rde_rib *rr; void *newp; short events; - conf = config; - peers = cpeers; - switch (pid = fork()) { case -1: fatal("cannot fork"); @@ -211,13 +234,6 @@ session_main(struct bgpd_config *config, return 
(pid); } - /* control socket is outside chroot */ - if ((csock = control_init(0, conf->csock)) == -1) - fatalx("control socket setup failed"); - if (conf->rcsock != NULL && - (rcsock = control_init(1, conf->rcsock)) == -1) - fatalx("control socket setup failed"); - if ((pw = getpwnam(BGPD_USER)) == NULL) fatal(NULL); @@ -228,29 +244,25 @@ session_main(struct bgpd_config *config, setproctitle("session engine"); bgpd_process = PROC_SE; - - if (pfkey_init(&sysdep) == -1) - fatalx("pfkey setup failed"); + pfkeysock = pfkey_init(&sysdep); if (setgroups(1, &pw->pw_gid) || setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) || setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid)) fatal("can't drop privileges"); - listener_cnt = 0; - setup_listeners(&listener_cnt); - signal(SIGTERM, session_sighdlr); signal(SIGINT, session_sighdlr); signal(SIGPIPE, SIG_IGN); signal(SIGHUP, SIG_IGN); - log_info("session engine ready"); + signal(SIGALRM, SIG_IGN); + signal(SIGUSR1, SIG_IGN); + close(pipe_m2s[0]); close(pipe_s2r[1]); close(pipe_s2rctl[1]); close(pipe_m2r[0]); close(pipe_m2r[1]); - init_conf(conf); if ((ibuf_rde = malloc(sizeof(struct imsgbuf))) == NULL || (ibuf_rde_ctl = malloc(sizeof(struct imsgbuf))) == NULL || (ibuf_main = malloc(sizeof(struct imsgbuf))) == NULL) @@ -258,37 +270,21 @@ session_main(struct bgpd_config *config, imsg_init(ibuf_rde, pipe_s2r[0]); imsg_init(ibuf_rde_ctl, pipe_s2rctl[0]); imsg_init(ibuf_main, pipe_m2s[1]); + TAILQ_INIT(&ctl_conns); - control_listen(csock); - control_listen(rcsock); LIST_INIT(&mrthead); + listener_cnt = 0; peer_cnt = 0; ctl_cnt = 0; - /* filter rules are not used in the SE */ - while ((r = TAILQ_FIRST(rules)) != NULL) { - TAILQ_REMOVE(rules, r, entry); - free(r); - } - free(rules); - - /* network list is not used in the SE */ - while ((net = TAILQ_FIRST(net_l)) != NULL) { - TAILQ_REMOVE(net_l, net, entry); - filterset_free(&net->net.attrset); - free(net); - } + if ((conf = calloc(1, sizeof(struct bgpd_config))) == NULL) + fatal(NULL); + if 
((conf->listen_addrs = calloc(1, sizeof(struct listen_addrs))) == + NULL) + fatal(NULL); + TAILQ_INIT(conf->listen_addrs); - /* main mrt list is not used in the SE */ - while ((m = LIST_FIRST(m_l)) != NULL) { - LIST_REMOVE(m, entry); - free(m); - } - /* rib names not used in the SE */ - while ((rr = SIMPLEQ_FIRST(&ribnames))) { - SIMPLEQ_REMOVE_HEAD(&ribnames, entry); - free(rr); - } + log_info("session engine ready"); while (session_quit == 0) { /* check for peers to be initialized or deleted */ @@ -308,8 +304,9 @@ session_main(struct bgpd_config *config, /* reinit due? */ if (p->conf.reconf_action == RECONF_REINIT) { - bgp_fsm(p, EVNT_STOP); - timer_set(p, Timer_IdleHold, 0); + session_stop(p, ERR_CEASE_ADMIN_RESET); + if (!p->conf.down) + timer_set(p, Timer_IdleHold, 0); } /* deletion due? */ @@ -317,7 +314,7 @@ session_main(struct bgpd_config *config, if (p->demoted) session_demote(p, -1); p->conf.demote_group[0] = 0; - bgp_fsm(p, EVNT_STOP); + session_stop(p, ERR_CEASE_PEER_UNCONF); log_peer_warnx(&p->conf, "removed"); if (last != NULL) last->next = next; @@ -346,9 +343,17 @@ session_main(struct bgpd_config *config, } mrt_cnt = 0; - LIST_FOREACH(m, &mrthead, entry) + for (m = LIST_FIRST(&mrthead); m != NULL; m = xm) { + xm = LIST_NEXT(m, entry); + if (m->state == MRT_STATE_REMOVE) { + mrt_clean(m); + LIST_REMOVE(m, entry); + free(m); + continue; + } if (m->wbuf.queued) mrt_cnt++; + } if (mrt_cnt > mrt_l_elms) { if ((newp = realloc(mrt_l, sizeof(struct mrt *) * @@ -394,18 +399,31 @@ session_main(struct bgpd_config *config, if (ctl_queued < SESSION_CTL_QUEUE_MAX) /* * Do not act as unlimited buffer. Don't read in more - * messages if the ctl sockets are getting full. + * messages if the ctl sockets are getting full. 
*/ pfd[PFD_PIPE_ROUTE_CTL].events = POLLIN; - pfd[PFD_SOCK_CTL].fd = csock; - pfd[PFD_SOCK_CTL].events = POLLIN; - pfd[PFD_SOCK_RCTL].fd = rcsock; - pfd[PFD_SOCK_RCTL].events = POLLIN; - + if (pauseaccept == 0) { + pfd[PFD_SOCK_CTL].fd = csock; + pfd[PFD_SOCK_CTL].events = POLLIN; + pfd[PFD_SOCK_RCTL].fd = rcsock; + pfd[PFD_SOCK_RCTL].events = POLLIN; + } else { + pfd[PFD_SOCK_CTL].fd = -1; + pfd[PFD_SOCK_RCTL].fd = -1; + } + pfd[PFD_SOCK_PFKEY].fd = pfkeysock; +#if !defined(__FreeBSD__) + pfd[PFD_SOCK_PFKEY].events = POLLIN; +#else + pfd[PFD_SOCK_PFKEY].events = 0; +#endif i = PFD_LISTENERS_START; TAILQ_FOREACH(la, conf->listen_addrs, entry) { - pfd[i].fd = la->fd; - pfd[i].events = POLLIN; + if (pauseaccept == 0) { + pfd[i].fd = la->fd; + pfd[i].events = POLLIN; + } else + pfd[i].fd = -1; i++; } idx_listeners = i; @@ -450,6 +468,10 @@ session_main(struct bgpd_config *config, p->state == STATE_ESTABLISHED) session_demote(p, -1); break; + case Timer_RestartTimeout: + timer_stop(p, Timer_RestartTimeout); + session_graceful_stop(p); + break; default: fatalx("King Bula lost in time"); } @@ -462,6 +484,9 @@ session_main(struct bgpd_config *config, events = POLLIN; if (p->wbuf.queued > 0 || p->state == STATE_CONNECT) events |= POLLOUT; + /* is there still work to do? */ + if (p->rbuf && p->rbuf->wpos) + timeout = 0; /* poll events */ if (p->fd != -1 && events != 0) { @@ -492,12 +517,21 @@ session_main(struct bgpd_config *config, i++; } + if (pauseaccept && timeout > 1) + timeout = 1; if (timeout < 0) timeout = 0; if ((nfds = poll(pfd, i, timeout * 1000)) == -1) if (errno != EINTR) fatal("poll error"); + /* + * If we previously saw fd exhaustion, we stop accept() + * for 1 second to throttle the accept() loop. 
+ */ + if (pauseaccept && getmonotime() > pauseaccept + 1) + pauseaccept = 0; + if (nfds > 0 && pfd[PFD_PIPE_MAIN].revents & POLLOUT) if (msgbuf_write(&ibuf_main->w) < 0) fatal("pipe write error"); @@ -534,6 +568,14 @@ session_main(struct bgpd_config *config, ctl_cnt += control_accept(rcsock, 1); } + if (nfds > 0 && pfd[PFD_SOCK_PFKEY].revents & POLLIN) { + nfds--; + if (pfkey_read(pfkeysock, NULL) == -1) { + log_warnx("pfkey_read failed, exiting..."); + session_quit = 1; + } + } + for (j = PFD_LISTENERS_START; nfds > 0 && j < idx_listeners; j++) if (pfd[j].revents & POLLIN) { @@ -545,6 +587,10 @@ session_main(struct bgpd_config *config, nfds -= session_dispatch_msg(&pfd[j], peer_l[j - idx_listeners]); + for (p = peers; p != NULL; p = p->next) + if (p->rbuf && p->rbuf->wpos) + session_process_msg(p); + for (; nfds > 0 && j < idx_mrts; j++) if (pfd[j].revents & POLLOUT) { nfds--; @@ -557,7 +603,7 @@ session_main(struct bgpd_config *config, while ((p = peers) != NULL) { peers = p->next; - bgp_fsm(p, EVNT_STOP); + session_stop(p, ERR_CEASE_ADMIN_DOWN); pfkey_remove(p); free(p); } @@ -643,10 +689,9 @@ bgp_fsm(struct peer *peer, enum session_ timer_stop(peer, Timer_IdleHold); /* allocate read buffer */ - peer->rbuf = calloc(1, sizeof(struct buf_read)); + peer->rbuf = calloc(1, sizeof(struct ibuf_read)); if (peer->rbuf == NULL) fatal(NULL); - peer->rbuf->wpos = 0; /* init write buffer */ msgbuf_init(&peer->wbuf); @@ -746,7 +791,6 @@ bgp_fsm(struct peer *peer, enum session_ /* ignore */ break; case EVNT_STOP: - session_notification(peer, ERR_CEASE, 0, NULL, 0); change_state(peer, STATE_IDLE, event); break; case EVNT_CON_CLOSED: @@ -780,7 +824,8 @@ bgp_fsm(struct peer *peer, enum session_ change_state(peer, STATE_IDLE, event); break; default: - session_notification(peer, ERR_FSM, 0, NULL, 0); + session_notification(peer, + ERR_FSM, ERR_FSM_UNEX_OPENSENT, NULL, 0); change_state(peer, STATE_IDLE, event); break; } @@ -791,7 +836,6 @@ bgp_fsm(struct peer *peer, enum session_ 
/* ignore */ break; case EVNT_STOP: - session_notification(peer, ERR_CEASE, 0, NULL, 0); change_state(peer, STATE_IDLE, event); break; case EVNT_CON_CLOSED: @@ -815,7 +859,8 @@ bgp_fsm(struct peer *peer, enum session_ change_state(peer, STATE_IDLE, event); break; default: - session_notification(peer, ERR_FSM, 0, NULL, 0); + session_notification(peer, + ERR_FSM, ERR_FSM_UNEX_OPENCONFIRM, NULL, 0); change_state(peer, STATE_IDLE, event); break; } @@ -826,7 +871,6 @@ bgp_fsm(struct peer *peer, enum session_ /* ignore */ break; case EVNT_STOP: - session_notification(peer, ERR_CEASE, 0, NULL, 0); change_state(peer, STATE_IDLE, event); break; case EVNT_CON_CLOSED: @@ -856,7 +900,8 @@ bgp_fsm(struct peer *peer, enum session_ change_state(peer, STATE_IDLE, event); break; default: - session_notification(peer, ERR_FSM, 0, NULL, 0); + session_notification(peer, + ERR_FSM, ERR_FSM_UNEX_ESTABLISHED, NULL, 0); change_state(peer, STATE_IDLE, event); break; } @@ -885,9 +930,10 @@ start_timer_keepalive(struct peer *peer) void session_close_connection(struct peer *peer) { - if (peer->fd != -1) + if (peer->fd != -1) { close(peer->fd); - + pauseaccept = 0; + } peer->fd = peer->wbuf.fd = -1; } @@ -923,20 +969,31 @@ change_state(struct peer *peer, enum ses timer_stop(peer, Timer_ConnectRetry); timer_stop(peer, Timer_Keepalive); timer_stop(peer, Timer_Hold); + timer_stop(peer, Timer_IdleHold); timer_stop(peer, Timer_IdleHoldReset); session_close_connection(peer); msgbuf_clear(&peer->wbuf); free(peer->rbuf); peer->rbuf = NULL; bzero(&peer->capa.peer, sizeof(peer->capa.peer)); - if (peer->state == STATE_ESTABLISHED) - session_down(peer); + if (event != EVNT_STOP) { timer_set(peer, Timer_IdleHold, peer->IdleHoldTime); if (event != EVNT_NONE && peer->IdleHoldTime < MAX_IDLE_HOLD/2) peer->IdleHoldTime *= 2; } + if (peer->state == STATE_ESTABLISHED) { + if (peer->capa.neg.grestart.restart == 2 && + (event == EVNT_CON_CLOSED || + event == EVNT_CON_FATAL)) { + /* don't punish graceful restart */ 
+ timer_set(peer, Timer_IdleHold, 0); + peer->IdleHoldTime /= 2; + session_graceful_restart(peer); + } else + session_down(peer); + } if (peer->state == STATE_NONE || peer->state == STATE_ESTABLISHED) { /* initialize capability negotiation structures */ @@ -947,6 +1004,20 @@ change_state(struct peer *peer, enum ses } break; case STATE_CONNECT: + if (peer->state == STATE_ESTABLISHED && + peer->capa.neg.grestart.restart == 2) { + /* do the graceful restart dance */ + session_graceful_restart(peer); + peer->holdtime = INTERVAL_HOLD_INITIAL; + timer_stop(peer, Timer_ConnectRetry); + timer_stop(peer, Timer_Keepalive); + timer_stop(peer, Timer_Hold); + timer_stop(peer, Timer_IdleHold); + timer_stop(peer, Timer_IdleHoldReset); + session_close_connection(peer); + msgbuf_clear(&peer->wbuf); + bzero(&peer->capa.peer, sizeof(peer->capa.peer)); + } break; case STATE_ACTIVE: break; @@ -990,7 +1061,10 @@ session_accept(int listenfd) len = sizeof(cliaddr); if ((connfd = accept(listenfd, (struct sockaddr *)&cliaddr, &len)) == -1) { - if (errno == EWOULDBLOCK || errno == EINTR) + if (errno == ENFILE || errno == EMFILE) { + pauseaccept = getmonotime(); + return; + } else if (errno == EWOULDBLOCK || errno == EINTR) return; else log_warn("accept"); @@ -1017,6 +1091,7 @@ session_accept(int listenfd) } } +open: if (p->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) { log_peer_warnx(&p->conf, "ipsec or md5sig configured but not available"); @@ -1049,6 +1124,13 @@ session_accept(int listenfd) } session_socket_blockmode(connfd, BM_NONBLOCK); bgp_fsm(p, EVNT_CON_OPEN); + return; + } else if (p != NULL && p->state == STATE_ESTABLISHED && + p->capa.neg.grestart.restart == 2) { + /* first do the graceful restart dance */ + change_state(p, STATE_CONNECT, EVNT_CON_CLOSED); + /* then do part of the open dance */ + goto open; } else { log_conn_attempt(p, (struct sockaddr *)&cliaddr); close(connfd); @@ -1069,7 +1151,7 @@ session_connect(struct peer *peer) if (peer->fd != -1) return (-1); - if 
((peer->fd = socket(peer->conf.remote_addr.af, SOCK_STREAM, + if ((peer->fd = socket(aid2af(peer->conf.remote_addr.aid), SOCK_STREAM, IPPROTO_TCP)) == -1) { log_peer_warn(&peer->conf, "session_connect socket"); bgp_fsm(peer, EVNT_CON_OPENFAIL); @@ -1100,8 +1182,7 @@ session_connect(struct peer *peer) peer->wbuf.fd = peer->fd; /* if update source is set we need to bind() */ - if (peer->conf.local_addr.af) { - sa = addr2sa(&peer->conf.local_addr, 0); + if ((sa = addr2sa(&peer->conf.local_addr, 0)) != NULL) { if (bind(peer->fd, sa, sa->sa_len) == -1) { log_peer_warn(&peer->conf, "session_connect bind"); bgp_fsm(peer, EVNT_CON_OPENFAIL); @@ -1139,42 +1220,50 @@ session_setup_socket(struct peer *p) int nodelay = 1; int bsize; - if (p->conf.ebgp && p->conf.remote_addr.af == AF_INET) { - /* set TTL to foreign router's distance - 1=direct n=multihop - with ttlsec, we always use 255 */ - if (p->conf.ttlsec) { - ttl = 256 - p->conf.distance; - if (setsockopt(p->fd, IPPROTO_IP, IP_MINTTL, &ttl, + switch (p->conf.remote_addr.aid) { + case AID_INET: + /* set precedence, see RFC 1771 appendix 5 */ + if (setsockopt(p->fd, IPPROTO_IP, IP_TOS, &pre, sizeof(pre)) == + -1) { + log_peer_warn(&p->conf, + "session_setup_socket setsockopt TOS"); + return (-1); + } + + if (p->conf.ebgp) { + /* set TTL to foreign router's distance + 1=direct n=multihop with ttlsec, we always use 255 */ + if (p->conf.ttlsec) { + ttl = 256 - p->conf.distance; + if (setsockopt(p->fd, IPPROTO_IP, IP_MINTTL, + &ttl, sizeof(ttl)) == -1) { + log_peer_warn(&p->conf, + "session_setup_socket: " + "setsockopt MINTTL"); + return (-1); + } + ttl = 255; + } + + if (setsockopt(p->fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) == -1) { log_peer_warn(&p->conf, - "session_setup_socket setsockopt MINTTL"); + "session_setup_socket setsockopt TTL"); return (-1); } - ttl = 255; - } - - if (setsockopt(p->fd, IPPROTO_IP, IP_TTL, &ttl, - sizeof(ttl)) == -1) { - log_peer_warn(&p->conf, - "session_setup_socket setsockopt TTL"); - return 
(-1); } - } - - if (p->conf.ebgp && p->conf.remote_addr.af == AF_INET6) - /* set hoplimit to foreign router's distance */ - if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, - sizeof(ttl)) == -1) { - log_peer_warn(&p->conf, - "session_setup_socket setsockopt hoplimit"); - return (-1); + break; + case AID_INET6: + if (p->conf.ebgp) { + /* set hoplimit to foreign router's distance */ + if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, + &ttl, sizeof(ttl)) == -1) { + log_peer_warn(&p->conf, + "session_setup_socket setsockopt hoplimit"); + return (-1); + } } - - /* if ttlsec is in use, set minttl */ - if (p->conf.ttlsec) { - ttl = 256 - p->conf.distance; - setsockopt(p->fd, IPPROTO_IP, IP_MINTTL, &ttl, sizeof(ttl)); - + break; } /* set TCP_NODELAY */ @@ -1185,24 +1274,18 @@ session_setup_socket(struct peer *p) return (-1); } - /* set precedence, see RFC 1771 appendix 5 */ - if (p->conf.remote_addr.af == AF_INET && - setsockopt(p->fd, IPPROTO_IP, IP_TOS, &pre, sizeof(pre)) == -1) { - log_peer_warn(&p->conf, - "session_setup_socket setsockopt TOS"); - return (-1); - } - /* only increase bufsize (and thus window) if md5 or ipsec is in use */ if (p->conf.auth.method != AUTH_NONE) { /* try to increase bufsize. 
no biggie if it fails */ bsize = 65535; - while (setsockopt(p->fd, SOL_SOCKET, SO_RCVBUF, &bsize, - sizeof(bsize)) == -1) + while (bsize > 8192 && + setsockopt(p->fd, SOL_SOCKET, SO_RCVBUF, &bsize, + sizeof(bsize)) == -1 && errno != EINVAL) bsize /= 2; bsize = 65535; - while (setsockopt(p->fd, SOL_SOCKET, SO_SNDBUF, &bsize, - sizeof(bsize)) == -1) + while (bsize > 8192 && + setsockopt(p->fd, SOL_SOCKET, SO_SNDBUF, &bsize, + sizeof(bsize)) == -1 && errno != EINVAL) bsize /= 2; } @@ -1244,40 +1327,56 @@ session_tcp_established(struct peer *pee void session_capa_ann_none(struct peer *peer) { - peer->capa.ann.mp_v4 = SAFI_NONE; - peer->capa.ann.mp_v4 = SAFI_NONE; - peer->capa.ann.refresh = 0; - peer->capa.ann.restart = 0; - peer->capa.ann.as4byte = 0; + bzero(&peer->capa.ann, sizeof(peer->capa.ann)); } int -session_capa_add(struct peer *p, struct buf *opb, u_int8_t capa_code, - u_int8_t capa_len, u_int8_t *optparamlen) -{ - u_int8_t op_type, op_len, tot_len, errs = 0; - - op_type = OPT_PARAM_CAPABILITIES; - op_len = sizeof(capa_code) + sizeof(capa_len) + capa_len; - tot_len = sizeof(op_type) + sizeof(op_len) + op_len; - errs += buf_add(opb, &op_type, sizeof(op_type)); - errs += buf_add(opb, &op_len, sizeof(op_len)); - errs += buf_add(opb, &capa_code, sizeof(capa_code)); - errs += buf_add(opb, &capa_len, sizeof(capa_len)); - *optparamlen += tot_len; +session_capa_add(struct ibuf *opb, u_int8_t capa_code, u_int8_t capa_len) +{ + int errs = 0; + + errs += ibuf_add(opb, &capa_code, sizeof(capa_code)); + errs += ibuf_add(opb, &capa_len, sizeof(capa_len)); return (errs); } int -session_capa_add_mp(struct buf *buf, u_int16_t afi, u_int8_t safi) +session_capa_add_mp(struct ibuf *buf, u_int8_t aid) { - u_int8_t pad = 0; + u_int8_t safi, pad = 0; + u_int16_t afi; int errs = 0; + if (aid2afi(aid, &afi, &safi) == -1) + fatalx("session_capa_add_mp: bad afi/safi pair"); + afi = htons(afi); + errs += ibuf_add(buf, &afi, sizeof(afi)); + errs += ibuf_add(buf, &pad, sizeof(pad)); + errs 
+= ibuf_add(buf, &safi, sizeof(safi)); + + return (errs); +} + +int +session_capa_add_gr(struct peer *p, struct ibuf *b, u_int8_t aid) +{ + u_int errs = 0; + u_int16_t afi; + u_int8_t flags, safi; + + if (aid2afi(aid, &afi, &safi)) { + log_warn("session_capa_add_gr: bad AID"); + return (1); + } + if (p->capa.neg.grestart.flags[aid] & CAPA_GR_RESTARTING) + flags = CAPA_GR_F_FLAG; + else + flags = 0; + afi = htons(afi); - errs += buf_add(buf, &afi, sizeof(afi)); - errs += buf_add(buf, &pad, sizeof(pad)); - errs += buf_add(buf, &safi, sizeof(safi)); + errs += ibuf_add(b, &afi, sizeof(afi)); + errs += ibuf_add(b, &safi, sizeof(safi)); + errs += ibuf_add(b, &flags, sizeof(flags)); return (errs); } @@ -1287,23 +1386,22 @@ session_newmsg(enum msg_type msgtype, u_ { struct bgp_msg *msg; struct msg_header hdr; - struct buf *buf; + struct ibuf *buf; int errs = 0; memset(&hdr.marker, 0xff, sizeof(hdr.marker)); hdr.len = htons(len); hdr.type = msgtype; - if ((buf = buf_open(len)) == NULL) + if ((buf = ibuf_open(len)) == NULL) return (NULL); - errs += buf_add(buf, &hdr.marker, sizeof(hdr.marker)); - errs += buf_add(buf, &hdr.len, sizeof(hdr.len)); - errs += buf_add(buf, &hdr.type, sizeof(hdr.type)); - - if (errs > 0 || - (msg = calloc(1, sizeof(*msg))) == NULL) { - buf_free(buf); + errs += ibuf_add(buf, &hdr.marker, sizeof(hdr.marker)); + errs += ibuf_add(buf, &hdr.len, sizeof(hdr.len)); + errs += ibuf_add(buf, &hdr.type, sizeof(hdr.type)); + + if (errs || (msg = calloc(1, sizeof(*msg))) == NULL) { + ibuf_free(buf); return (NULL); } @@ -1329,7 +1427,7 @@ session_sendmsg(struct bgp_msg *msg, str mrt_dump_bgp_msg(mrt, msg->buf->buf, msg->len, p); } - buf_close(&p->wbuf, msg->buf); + ibuf_close(&p->wbuf, msg->buf); free(msg); return (0); } @@ -1338,40 +1436,70 @@ void session_open(struct peer *p) { struct bgp_msg *buf; - struct buf *opb; + struct ibuf *opb; struct msg_open msg; u_int16_t len; - u_int8_t optparamlen = 0; - u_int errs = 0; + u_int8_t i, op_type, optparamlen = 0; + 
int errs = 0; + int mpcapa = 0; - if ((opb = buf_dynamic(0, MAX_PKTSIZE - MSGSIZE_OPEN_MIN)) == NULL) { + if ((opb = ibuf_dynamic(0, UCHAR_MAX - sizeof(op_type) - + sizeof(optparamlen))) == NULL) { bgp_fsm(p, EVNT_CON_FATAL); return; } /* multiprotocol extensions, RFC 4760 */ - if (p->capa.ann.mp_v4) { /* 4 bytes data */ - errs += session_capa_add(p, opb, CAPA_MP, 4, &optparamlen); - errs += session_capa_add_mp(opb, AFI_IPv4, p->capa.ann.mp_v4); - } - if (p->capa.ann.mp_v6) { /* 4 bytes data */ - errs += session_capa_add(p, opb, CAPA_MP, 4, &optparamlen); - errs += session_capa_add_mp(opb, AFI_IPv6, p->capa.ann.mp_v6); - } + for (i = 0; i < AID_MAX; i++) + if (p->capa.ann.mp[i]) { /* 4 bytes data */ + errs += session_capa_add(opb, CAPA_MP, 4); + errs += session_capa_add_mp(opb, i); + mpcapa++; + } /* route refresh, RFC 2918 */ if (p->capa.ann.refresh) /* no data */ - errs += session_capa_add(p, opb, CAPA_REFRESH, 0, &optparamlen); + errs += session_capa_add(opb, CAPA_REFRESH, 0); - /* End-of-RIB marker, RFC 4724 */ - if (p->capa.ann.restart) { /* 2 bytes data */ - u_char c[2]; - - bzero(&c, 2); - c[0] = 0x80; /* we're always restarting */ - errs += session_capa_add(p, opb, CAPA_RESTART, 2, &optparamlen); - errs += buf_add(opb, &c, 2); + /* graceful restart and End-of-RIB marker, RFC 4724 */ + if (p->capa.ann.grestart.restart) { + int rst = 0; + u_int16_t hdr; + u_int8_t grlen; + + if (mpcapa) { + grlen = 2 + 4 * mpcapa; + for (i = 0; i < AID_MAX; i++) { + if (p->capa.neg.grestart.flags[i] & + CAPA_GR_RESTARTING) + rst++; + } + } else { /* AID_INET */ + grlen = 2 + 4; + if (p->capa.neg.grestart.flags[AID_INET] & + CAPA_GR_RESTARTING) + rst++; + } + + hdr = conf->holdtime; /* default timeout */ + /* if client does graceful restart don't set R flag */ + if (!rst) + hdr |= CAPA_GR_R_FLAG; + hdr = htons(hdr); + + errs += session_capa_add(opb, CAPA_RESTART, grlen); + errs += ibuf_add(opb, &hdr, sizeof(hdr)); + + if (mpcapa) { + for (i = 0; i < AID_MAX; i++) { + if 
(p->capa.ann.mp[i]) { + errs += session_capa_add_gr(p, opb, i); + } + } + } else { /* AID_INET */ + errs += session_capa_add_gr(p, opb, AID_INET); + } } /* 4-bytes AS numbers, draft-ietf-idr-as4bytes-13 */ @@ -1379,13 +1507,17 @@ session_open(struct peer *p) u_int32_t nas; nas = htonl(conf->as); - errs += session_capa_add(p, opb, CAPA_AS4BYTE, 4, &optparamlen); - errs += buf_add(opb, &nas, 4); + errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(nas)); + errs += ibuf_add(opb, &nas, sizeof(nas)); } + if (ibuf_size(opb)) + optparamlen = ibuf_size(opb) + sizeof(op_type) + + sizeof(optparamlen); + len = MSGSIZE_OPEN_MIN + optparamlen; if (errs || (buf = session_newmsg(OPEN, len)) == NULL) { - buf_free(opb); + ibuf_free(opb); bgp_fsm(p, EVNT_CON_FATAL); return; } @@ -1399,19 +1531,24 @@ session_open(struct peer *p) msg.bgpid = conf->bgpid; /* is already in network byte order */ msg.optparamlen = optparamlen; - errs += buf_add(buf->buf, &msg.version, sizeof(msg.version)); - errs += buf_add(buf->buf, &msg.myas, sizeof(msg.myas)); - errs += buf_add(buf->buf, &msg.holdtime, sizeof(msg.holdtime)); - errs += buf_add(buf->buf, &msg.bgpid, sizeof(msg.bgpid)); - errs += buf_add(buf->buf, &msg.optparamlen, sizeof(msg.optparamlen)); - - if (optparamlen) - errs += buf_add(buf->buf, opb->buf, optparamlen); + errs += ibuf_add(buf->buf, &msg.version, sizeof(msg.version)); + errs += ibuf_add(buf->buf, &msg.myas, sizeof(msg.myas)); + errs += ibuf_add(buf->buf, &msg.holdtime, sizeof(msg.holdtime)); + errs += ibuf_add(buf->buf, &msg.bgpid, sizeof(msg.bgpid)); + errs += ibuf_add(buf->buf, &msg.optparamlen, sizeof(msg.optparamlen)); + + if (optparamlen) { + op_type = OPT_PARAM_CAPABILITIES; + optparamlen = ibuf_size(opb); + errs += ibuf_add(buf->buf, &op_type, sizeof(op_type)); + errs += ibuf_add(buf->buf, &optparamlen, sizeof(optparamlen)); + errs += ibuf_add(buf->buf, opb->buf, ibuf_size(opb)); + } - buf_free(opb); + ibuf_free(opb); - if (errs > 0) { - buf_free(buf->buf); + if (errs) { + 
ibuf_free(buf->buf); free(buf); bgp_fsm(p, EVNT_CON_FATAL); return; @@ -1459,8 +1596,8 @@ session_update(u_int32_t peerid, void *d return; } - if (buf_add(buf->buf, data, datalen)) { - buf_free(buf->buf); + if (ibuf_add(buf->buf, data, datalen)) { + ibuf_free(buf->buf); free(buf); bgp_fsm(p, EVNT_CON_FATAL); return; @@ -1480,29 +1617,27 @@ session_notification(struct peer *p, u_i void *data, ssize_t datalen) { struct bgp_msg *buf; - u_int errs = 0; - u_int8_t null8 = 0; + int errs = 0; if (p->stats.last_sent_errcode) /* some notification already sent */ return; + log_notification(p, errcode, subcode, data, datalen, "sending"); + if ((buf = session_newmsg(NOTIFICATION, MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) { bgp_fsm(p, EVNT_CON_FATAL); return; } - errs += buf_add(buf->buf, &errcode, sizeof(errcode)); - if (errcode == ERR_CEASE) - errs += buf_add(buf->buf, &null8, sizeof(null8)); - else - errs += buf_add(buf->buf, &subcode, sizeof(subcode)); + errs += ibuf_add(buf->buf, &errcode, sizeof(errcode)); + errs += ibuf_add(buf->buf, &subcode, sizeof(subcode)); if (datalen > 0) - errs += buf_add(buf->buf, data, datalen); + errs += ibuf_add(buf->buf, data, datalen); - if (errs > 0) { - buf_free(buf->buf); + if (errs) { + ibuf_free(buf->buf); free(buf); bgp_fsm(p, EVNT_CON_FATAL); return; @@ -1521,23 +1656,29 @@ session_notification(struct peer *p, u_i int session_neighbor_rrefresh(struct peer *p) { + u_int8_t i; + if (!p->capa.peer.refresh) return (-1); - if (p->capa.peer.mp_v4 != SAFI_NONE) - session_rrefresh(p, AFI_IPv4, p->capa.peer.mp_v4); - if (p->capa.peer.mp_v6 != SAFI_NONE) - session_rrefresh(p, AFI_IPv6, p->capa.peer.mp_v6); + for (i = 0; i < AID_MAX; i++) { + if (p->capa.peer.mp[i] != 0) + session_rrefresh(p, i); + } return (0); } void -session_rrefresh(struct peer *p, u_int16_t afi, u_int8_t safi) +session_rrefresh(struct peer *p, u_int8_t aid) { struct bgp_msg *buf; int errs = 0; - u_int8_t null8 = 0; + u_int16_t afi; + u_int8_t safi, null8 = 0; + + if 
(aid2afi(aid, &afi, &safi) == -1) + fatalx("session_rrefresh: bad afi/safi pair"); if ((buf = session_newmsg(RREFRESH, MSGSIZE_RREFRESH)) == NULL) { bgp_fsm(p, EVNT_CON_FATAL); @@ -1545,12 +1686,12 @@ session_rrefresh(struct peer *p, u_int16 } afi = htons(afi); - errs += buf_add(buf->buf, &afi, sizeof(afi)); - errs += buf_add(buf->buf, &null8, sizeof(null8)); - errs += buf_add(buf->buf, &safi, sizeof(safi)); + errs += ibuf_add(buf->buf, &afi, sizeof(afi)); + errs += ibuf_add(buf->buf, &null8, sizeof(null8)); + errs += ibuf_add(buf->buf, &safi, sizeof(safi)); - if (errs > 0) { - buf_free(buf->buf); + if (errs) { + ibuf_free(buf->buf); free(buf); bgp_fsm(p, EVNT_CON_FATAL); return; @@ -1565,13 +1706,74 @@ session_rrefresh(struct peer *p, u_int16 } int +session_graceful_restart(struct peer *p) +{ + u_int8_t i; + + timer_set(p, Timer_RestartTimeout, p->capa.neg.grestart.timeout); + + for (i = 0; i < AID_MAX; i++) { + if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) { + if (imsg_compose(ibuf_rde, IMSG_SESSION_STALE, + p->conf.id, 0, -1, &i, sizeof(i)) == -1) + return (-1); + log_peer_warnx(&p->conf, + "graceful restart of %s, keeping routes", + aid2str(i)); + p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING; + } else if (p->capa.neg.mp[i]) { + if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH, + p->conf.id, 0, -1, &i, sizeof(i)) == -1) + return (-1); + log_peer_warnx(&p->conf, + "graceful restart of %s, flushing routes", + aid2str(i)); + } + } + return (0); +} + +int +session_graceful_is_restarting(struct peer *p) +{ + u_int8_t i; + + for (i = 0; i < AID_MAX; i++) + if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) + return (1); + return (0); +} + +int +session_graceful_stop(struct peer *p) +{ + u_int8_t i; + + for (i = 0; i < AID_MAX; i++) { + /* + * Only flush if the peer is restarting and the peer indicated + * it hold the forwarding state. In all other cases the + * session was already flushed when the session came up. 
+ */ + if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING && + p->capa.neg.grestart.flags[i] & CAPA_GR_FORWARD) { + log_peer_warnx(&p->conf, "graceful restart of %s, " + "time-out, flushing", aid2str(i)); + if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH, + p->conf.id, 0, -1, &i, sizeof(i)) == -1) + return (-1); + } + p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING; + } + return (0); +} + +int session_dispatch_msg(struct pollfd *pfd, struct peer *p) { - ssize_t n, rpos, av, left; + ssize_t n; socklen_t len; - int error, processed = 0; - u_int16_t msglen; - u_int8_t msgtype; + int error; if (p->state == STATE_CONNECT) { if (pfd->revents & POLLOUT) { @@ -1641,71 +1843,83 @@ session_dispatch_msg(struct pollfd *pfd, return (1); } - rpos = 0; - av = p->rbuf->wpos + n; + p->rbuf->wpos += n; p->stats.last_read = time(NULL); + return (1); + } + return (0); +} - /* - * session might drop to IDLE -> buffers deallocated - * we MUST check rbuf != NULL before use - */ - for (;;) { - if (rpos + MSGSIZE_HEADER > av) - break; - if (p->rbuf == NULL) - break; - if (parse_header(p, p->rbuf->buf + rpos, &msglen, - &msgtype) == -1) - return (0); - if (rpos + msglen > av) - break; - p->rbuf->rptr = p->rbuf->buf + rpos; - - switch (msgtype) { - case OPEN: - bgp_fsm(p, EVNT_RCVD_OPEN); - p->stats.msg_rcvd_open++; - break; - case UPDATE: - bgp_fsm(p, EVNT_RCVD_UPDATE); - p->stats.msg_rcvd_update++; - break; - case NOTIFICATION: - bgp_fsm(p, EVNT_RCVD_NOTIFICATION); - p->stats.msg_rcvd_notification++; - break; - case KEEPALIVE: - bgp_fsm(p, EVNT_RCVD_KEEPALIVE); - p->stats.msg_rcvd_keepalive++; - break; - case RREFRESH: - parse_refresh(p); - p->stats.msg_rcvd_rrefresh++; - break; - default: /* cannot happen */ - session_notification(p, ERR_HEADER, - ERR_HDR_TYPE, &msgtype, 1); - log_warnx("received message with " - "unknown type %u", msgtype); - bgp_fsm(p, EVNT_CON_FATAL); - } - rpos += msglen; - if (++processed > MSG_PROCESS_LIMIT) - break; - } - if (p->rbuf == NULL) - return (1); 
+int +session_process_msg(struct peer *p) +{ + ssize_t rpos, av, left; + int processed = 0; + u_int16_t msglen; + u_int8_t msgtype; - if (rpos < av) { - left = av - rpos; - memcpy(&p->rbuf->buf, p->rbuf->buf + rpos, left); - p->rbuf->wpos = left; - } else - p->rbuf->wpos = 0; + rpos = 0; + av = p->rbuf->wpos; - return (1); + /* + * session might drop to IDLE -> buffers deallocated + * we MUST check rbuf != NULL before use + */ + for (;;) { + if (rpos + MSGSIZE_HEADER > av) + break; + if (p->rbuf == NULL) + break; + if (parse_header(p, p->rbuf->buf + rpos, &msglen, + &msgtype) == -1) + return (0); + if (rpos + msglen > av) + break; + p->rbuf->rptr = p->rbuf->buf + rpos; + + switch (msgtype) { + case OPEN: + bgp_fsm(p, EVNT_RCVD_OPEN); + p->stats.msg_rcvd_open++; + break; + case UPDATE: + bgp_fsm(p, EVNT_RCVD_UPDATE); + p->stats.msg_rcvd_update++; + break; + case NOTIFICATION: + bgp_fsm(p, EVNT_RCVD_NOTIFICATION); + p->stats.msg_rcvd_notification++; + break; + case KEEPALIVE: + bgp_fsm(p, EVNT_RCVD_KEEPALIVE); + p->stats.msg_rcvd_keepalive++; + break; + case RREFRESH: + parse_refresh(p); + p->stats.msg_rcvd_rrefresh++; + break; + default: /* cannot happen */ + session_notification(p, ERR_HEADER, ERR_HDR_TYPE, + &msgtype, 1); + log_warnx("received message with unknown type %u", + msgtype); + bgp_fsm(p, EVNT_CON_FATAL); + } + rpos += msglen; + if (++processed > MSG_PROCESS_LIMIT) + break; } - return (0); + if (p->rbuf == NULL) + return (1); + + if (rpos < av) { /* keep unparsed tail; src and dst overlap */ + left = av - rpos; + memmove(&p->rbuf->buf, p->rbuf->buf + rpos, left); + p->rbuf->wpos = left; + } else + p->rbuf->wpos = 0; + + return (1); } int @@ -1853,12 +2067,6 @@ parse_open(struct peer *peer) p += sizeof(short_as); as = peer->short_as = ntohs(short_as); - /* if remote-as is zero and it's a cloned neighbor, accept any */ - if (peer->conf.cloned && !peer->conf.remote_as && as != AS_TRANS) { - peer->conf.remote_as = as; - peer->conf.ebgp = (peer->conf.remote_as != conf->as); - } - memcpy(&oholdtime, p,
sizeof(oholdtime)); p += sizeof(oholdtime); @@ -1966,6 +2174,15 @@ parse_open(struct peer *peer) } } + /* if remote-as is zero and it's a cloned neighbor, accept any */ + if (peer->conf.cloned && !peer->conf.remote_as && as != AS_TRANS) { + peer->conf.remote_as = as; + peer->conf.ebgp = (peer->conf.remote_as != conf->as); + if (!peer->conf.ebgp) + /* force enforce_as off for iBGP sessions */ + peer->conf.enforce_as = ENFORCE_AS_OFF; + } + if (peer->conf.remote_as != as) { log_peer_warnx(&peer->conf, "peer sent wrong AS %s", log_as(as)); @@ -1974,6 +2191,14 @@ parse_open(struct peer *peer) return (-1); } + if (capa_neg_calc(peer) == -1) { + log_peer_warnx(&peer->conf, + "capability negotiation calculation failed"); + session_notification(peer, ERR_OPEN, 0, NULL, 0); + change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN); + return (-1); + } + return (0); } @@ -2008,24 +2233,35 @@ int parse_refresh(struct peer *peer) { u_char *p; - struct rrefresh r; + u_int16_t afi; + u_int8_t aid, safi; p = peer->rbuf->rptr; p += MSGSIZE_HEADER; /* header is already checked */ + /* + * We could check if we actually announced the capability but + * as long as the message is correctly encoded we don't care. 
+ */ + /* afi, 2 byte */ - memcpy(&r.afi, p, sizeof(r.afi)); - r.afi = ntohs(r.afi); + memcpy(&afi, p, sizeof(afi)); + afi = ntohs(afi); p += 2; /* reserved, 1 byte */ p += 1; /* safi, 1 byte */ - memcpy(&r.safi, p, sizeof(r.safi)); + memcpy(&safi, p, sizeof(safi)); /* afi/safi unchecked - unrecognized values will be ignored anyway */ + if (afi2aid(afi, safi, &aid) == -1) { + log_peer_warnx(&peer->conf, "peer sent bad refresh, " + "invalid afi/safi pair"); + return (0); + } - if (imsg_compose(ibuf_rde, IMSG_REFRESH, peer->conf.id, 0, -1, &r, - sizeof(r)) == -1) + if (imsg_compose(ibuf_rde, IMSG_REFRESH, peer->conf.id, 0, -1, &aid, + sizeof(aid)) == -1) return (-1); return (0); @@ -2035,11 +2271,12 @@ int parse_notification(struct peer *peer) { u_char *p; + u_int16_t datalen; u_int8_t errcode; u_int8_t subcode; - u_int16_t datalen; u_int8_t capa_code; u_int8_t capa_len; + u_int8_t i; /* just log */ p = peer->rbuf->rptr; @@ -2059,7 +2296,7 @@ parse_notification(struct peer *peer) p += sizeof(subcode); datalen -= sizeof(subcode); - log_notification(peer, errcode, subcode, p, datalen); + log_notification(peer, errcode, subcode, p, datalen, "received"); peer->errcnt++; if (errcode == ERR_OPEN && subcode == ERR_OPEN_CAPA) { @@ -2094,8 +2331,8 @@ parse_notification(struct peer *peer) datalen -= capa_len; switch (capa_code) { case CAPA_MP: - peer->capa.ann.mp_v4 = SAFI_NONE; - peer->capa.ann.mp_v6 = SAFI_NONE; + for (i = 0; i < AID_MAX; i++) + peer->capa.ann.mp[i] = 0; log_peer_warnx(&peer->conf, "disabling multiprotocol capability"); break; @@ -2105,7 +2342,7 @@ parse_notification(struct peer *peer) "disabling route refresh capability"); break; case CAPA_RESTART: - peer->capa.ann.restart = 0; + peer->capa.ann.grestart.restart = 0; log_peer_warnx(&peer->conf, "disabling restart capability"); break; @@ -2139,19 +2376,23 @@ parse_notification(struct peer *peer) int parse_capabilities(struct peer *peer, u_char *d, u_int16_t dlen, u_int32_t *as) { + u_char *capa_val; + 
u_int32_t remote_as; u_int16_t len; + u_int16_t afi; + u_int16_t gr_header; + u_int8_t safi; + u_int8_t aid; + u_int8_t gr_flags; u_int8_t capa_code; u_int8_t capa_len; - u_char *capa_val; - u_int16_t mp_afi; - u_int8_t mp_safi; - u_int32_t remote_as; + u_int8_t i; len = dlen; while (len > 0) { if (len < 2) { - log_peer_warnx(&peer->conf, "parse_capabilities: " - "expect len >= 2, len is %u", len); + log_peer_warnx(&peer->conf, "Bad capabilities attr " + "length: %u, too short", len); return (-1); } memcpy(&capa_code, d, sizeof(capa_code)); @@ -2163,7 +2404,7 @@ parse_capabilities(struct peer *peer, u_ if (capa_len > 0) { if (len < capa_len) { log_peer_warnx(&peer->conf, - "parse_capabilities: " + "Bad capabilities attr length: " "len %u smaller than capa_len %u", len, capa_len); return (-1); @@ -2178,47 +2419,82 @@ parse_capabilities(struct peer *peer, u_ case CAPA_MP: /* RFC 4760 */ if (capa_len != 4) { log_peer_warnx(&peer->conf, - "parse_capabilities: " - "expect len 4, len is %u", capa_len); - return (-1); - } - memcpy(&mp_afi, capa_val, sizeof(mp_afi)); - mp_afi = ntohs(mp_afi); - memcpy(&mp_safi, capa_val + 3, sizeof(mp_safi)); - switch (mp_afi) { - case AFI_IPv4: - if (mp_safi < 1 || mp_safi > 3) - log_peer_warnx(&peer->conf, - "parse_capabilities: AFI IPv4, " - "mp_safi %u unknown", mp_safi); - else - peer->capa.peer.mp_v4 = mp_safi; + "Bad multi protocol capability length: " + "%u", capa_len); break; - case AFI_IPv6: - if (mp_safi < 1 || mp_safi > 3) - log_peer_warnx(&peer->conf, - "parse_capabilities: AFI IPv6, " - "mp_safi %u unknown", mp_safi); - else - peer->capa.peer.mp_v6 = mp_safi; - break; - default: /* ignore */ + } + memcpy(&afi, capa_val, sizeof(afi)); + afi = ntohs(afi); + memcpy(&safi, capa_val + 3, sizeof(safi)); + if (afi2aid(afi, safi, &aid) == -1) { + log_peer_warnx(&peer->conf, + "Received multi protocol capability: " + " unknown AFI %u, safi %u pair", + afi, safi); break; } + peer->capa.peer.mp[aid] = 1; break; case CAPA_REFRESH: 
peer->capa.peer.refresh = 1; break; case CAPA_RESTART: - peer->capa.peer.restart = 1; - /* we don't care about the further restart capas yet */ + if (capa_len == 2) { + /* peer only supports EoR marker */ + peer->capa.peer.grestart.restart = 1; + peer->capa.peer.grestart.timeout = 0; + break; + } else if (capa_len % 4 != 2) { + log_peer_warnx(&peer->conf, + "Bad graceful restart capability length: " + "%u", capa_len); + peer->capa.peer.grestart.restart = 0; + peer->capa.peer.grestart.timeout = 0; + break; + } + + memcpy(&gr_header, capa_val, sizeof(gr_header)); + gr_header = ntohs(gr_header); + peer->capa.peer.grestart.timeout = + gr_header & CAPA_GR_TIMEMASK; + if (peer->capa.peer.grestart.timeout == 0) { + log_peer_warnx(&peer->conf, "Received " + "graceful restart timeout is zero"); + peer->capa.peer.grestart.restart = 0; + break; + } + + for (i = 2; i <= capa_len - 4; i += 4) { + memcpy(&afi, capa_val + i, sizeof(afi)); + afi = ntohs(afi); + memcpy(&safi, capa_val + i + 2, sizeof(safi)); + if (afi2aid(afi, safi, &aid) == -1) { + log_peer_warnx(&peer->conf, + "Received graceful restart capa: " + " unknown AFI %u, safi %u pair", + afi, safi); + continue; + } + memcpy(&gr_flags, capa_val + i + 3, + sizeof(gr_flags)); + peer->capa.peer.grestart.flags[aid] |= + CAPA_GR_PRESENT; + if (gr_flags & CAPA_GR_F_FLAG) + peer->capa.peer.grestart.flags[aid] |= + CAPA_GR_FORWARD; + if (gr_header & CAPA_GR_R_FLAG) + peer->capa.peer.grestart.flags[aid] |= + CAPA_GR_RESTART; + peer->capa.peer.grestart.restart = 2; + } break; case CAPA_AS4BYTE: if (capa_len != 4) { log_peer_warnx(&peer->conf, - "parse_capabilities: " - "expect len 4, len is %u", capa_len); - return (-1); + "Bad AS4BYTE capability length: " + "%u", capa_len); + peer->capa.peer.as4byte = 0; + break; } memcpy(&remote_as, capa_val, sizeof(remote_as)); *as = ntohl(remote_as); @@ -2232,6 +2508,66 @@ parse_capabilities(struct peer *peer, u_ return (0); } +int +capa_neg_calc(struct peer *p) +{ + u_int8_t i, hasmp = 0; + + 
/* refresh: does not really matter here, use peer setting */ + p->capa.neg.refresh = p->capa.peer.refresh; + + /* as4byte: both sides must announce capability */ + if (p->capa.ann.as4byte && p->capa.peer.as4byte) + p->capa.neg.as4byte = 1; + else + p->capa.neg.as4byte = 0; + + /* MP: both sides must announce capability */ + for (i = 0; i < AID_MAX; i++) { + if (p->capa.ann.mp[i] && p->capa.peer.mp[i]) { + p->capa.neg.mp[i] = 1; + hasmp = 1; + } else + p->capa.neg.mp[i] = 0; + } + /* if no MP capability present default to IPv4 unicast mode */ + if (!hasmp) + p->capa.neg.mp[AID_INET] = 1; + + /* + * graceful restart: only the peer capabilities are of interest here. + * It is necessary to compare the new values with the previous ones + * and act accordingly. AFI/SAFI that are not part of the MP capability + * are treated as not being present. + */ + + for (i = 0; i < AID_MAX; i++) { + /* disable GR if the AFI/SAFI is not present */ + if (p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT && + p->capa.neg.mp[i] == 0) + p->capa.peer.grestart.flags[i] = 0; /* disable */ + /* look at current GR state and decide what to do */ + if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) { + if (!(p->capa.peer.grestart.flags[i] & + CAPA_GR_FORWARD)) { + if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH, + p->conf.id, 0, -1, &i, sizeof(i)) == -1) + return (-1); + log_peer_warnx(&p->conf, "graceful restart of " + "%s, not restarted, flushing", aid2str(i)); + } + p->capa.neg.grestart.flags[i] = + p->capa.peer.grestart.flags[i] | CAPA_GR_RESTARTING; + } else + p->capa.neg.grestart.flags[i] = + p->capa.peer.grestart.flags[i]; + } + p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout; + p->capa.neg.grestart.restart = p->capa.peer.grestart.restart; + + return (0); +} + void session_dispatch_imsg(struct imsgbuf *ibuf, int idx, u_int *listener_cnt) { @@ -2244,8 +2580,8 @@ session_dispatch_imsg(struct imsgbuf *ib struct kif *kif; u_char *data; enum reconf_action reconf; - int n,
depend_ok; - u_int8_t errcode, subcode; + int n, depend_ok, restricted; + u_int8_t aid, errcode, subcode; if ((n = imsg_read(ibuf)) == -1) fatal("session_dispatch_imsg: imsg_read error"); @@ -2332,15 +2668,42 @@ session_dispatch_imsg(struct imsgbuf *ib } break; + case IMSG_RECONF_CTRL: + if (idx != PFD_PIPE_MAIN) + fatalx("reconf request not from parent"); + if (imsg.hdr.len != IMSG_HEADER_SIZE + + sizeof(restricted)) + fatalx("RECONF_CTRL imsg with wrong len"); + memcpy(&restricted, imsg.data, sizeof(restricted)); + if (imsg.fd == -1) { + log_warnx("expected to receive fd for control " + "socket but didn't receive any"); + break; + } + if (restricted) { + control_shutdown(rcsock); + rcsock = imsg.fd; + control_listen(rcsock); + } else { + control_shutdown(csock); + csock = imsg.fd; + control_listen(csock); + } + break; case IMSG_RECONF_DONE: if (idx != PFD_PIPE_MAIN) fatalx("reconf request not from parent"); if (nconf == NULL) fatalx("got IMSG_RECONF_DONE but no config"); + conf->flags = nconf->flags; + conf->log = nconf->log; + conf->bgpid = nconf->bgpid; + conf->clusterid = nconf->clusterid; conf->as = nconf->as; + conf->short_as = nconf->short_as; conf->holdtime = nconf->holdtime; - conf->bgpid = nconf->bgpid; conf->min_holdtime = nconf->min_holdtime; + conf->connectretry = nconf->connectretry; /* add new peers */ for (p = npeers; p != NULL; p = next) { @@ -2388,6 +2751,8 @@ session_dispatch_imsg(struct imsgbuf *ib nconf = NULL; pending_reconf = 0; log_info("SE reconfigured"); + imsg_compose(ibuf_main, IMSG_RECONF_DONE, 0, 0, + -1, NULL, 0); break; case IMSG_IFINFO: if (idx != PFD_PIPE_MAIN) @@ -2397,9 +2762,7 @@ session_dispatch_imsg(struct imsgbuf *ib fatalx("IFINFO imsg with wrong len"); kif = imsg.data; depend_ok = (kif->flags & IFF_UP) && - (LINK_STATE_IS_UP(kif->link_state) || - (kif->link_state == LINK_STATE_UNKNOWN && - kif->media_type != IFT_CARP)); + LINK_STATE_IS_UP(kif->link_state); for (p = peers; p != NULL; p = p->next) if (!strcmp(p->conf.if_depend,
kif->ifname)) { @@ -2408,7 +2771,8 @@ session_dispatch_imsg(struct imsgbuf *ib bgp_fsm(p, EVNT_START); } else if (!depend_ok && p->depend_ok) { p->depend_ok = depend_ok; - bgp_fsm(p, EVNT_STOP); + session_stop(p, + ERR_CEASE_OTHER_CHANGE); } } break; @@ -2456,10 +2820,10 @@ session_dispatch_imsg(struct imsgbuf *ib } break; case IMSG_CTL_KROUTE: - case IMSG_CTL_KROUTE6: case IMSG_CTL_KROUTE_ADDR: case IMSG_CTL_SHOW_NEXTHOP: case IMSG_CTL_SHOW_INTERFACE: + case IMSG_CTL_SHOW_FIB_TABLES: if (idx != PFD_PIPE_MAIN) fatalx("ctl kroute request not from parent"); control_imsg_relay(&imsg); @@ -2469,7 +2833,6 @@ session_dispatch_imsg(struct imsgbuf *ib case IMSG_CTL_SHOW_RIB_ATTR: case IMSG_CTL_SHOW_RIB_MEM: case IMSG_CTL_SHOW_NETWORK: - case IMSG_CTL_SHOW_NETWORK6: case IMSG_CTL_SHOW_NEIGHBOR: if (idx != PFD_PIPE_ROUTE_CTL) fatalx("ctl rib request not from RDE"); @@ -2531,6 +2894,40 @@ session_dispatch_imsg(struct imsgbuf *ib break; } break; + case IMSG_SESSION_RESTARTED: + if (idx != PFD_PIPE_ROUTE) + fatalx("update request not from RDE"); + if (imsg.hdr.len < IMSG_HEADER_SIZE + sizeof(aid)) { + log_warnx("RDE sent invalid restart msg"); + break; + } + if ((p = getpeerbyid(imsg.hdr.peerid)) == NULL) { + log_warnx("no such peer: id=%u", + imsg.hdr.peerid); + break; + } + memcpy(&aid, imsg.data, sizeof(aid)); + if (aid >= AID_MAX) + fatalx("IMSG_SESSION_RESTARTED: bad AID"); + if (p->capa.neg.grestart.flags[aid] & + CAPA_GR_RESTARTING && + p->capa.neg.grestart.flags[aid] & + CAPA_GR_FORWARD) { + log_peer_warnx(&p->conf, + "graceful restart of %s finished", + aid2str(aid)); + p->capa.neg.grestart.flags[aid] &= + ~CAPA_GR_RESTARTING; + timer_stop(p, Timer_RestartTimeout); + + /* signal back to RDE to cleanup stale routes */ + if (imsg_compose(ibuf_rde, + IMSG_SESSION_RESTARTED, imsg.hdr.peerid, 0, + -1, &aid, sizeof(aid)) == -1) + fatal("imsg_compose: " + "IMSG_SESSION_RESTARTED"); + } + break; default: break; } @@ -2612,29 +3009,23 @@ getpeerbydesc(const char *descr) struct 
peer * getpeerbyip(struct sockaddr *ip) { + struct bgpd_addr addr; struct peer *p, *newpeer, *loose = NULL; u_int32_t id; + sa2addr(ip, &addr); + /* we might want a more effective way to find peers by IP */ for (p = peers; p != NULL; p = p->next) if (!p->conf.template && - p->conf.remote_addr.af == ip->sa_family) { - if (p->conf.remote_addr.af == AF_INET && - p->conf.remote_addr.v4.s_addr == - ((struct sockaddr_in *)ip)->sin_addr.s_addr) - return (p); - if (p->conf.remote_addr.af == AF_INET6 && - !bcmp(&p->conf.remote_addr.v6, - &((struct sockaddr_in6 *)ip)->sin6_addr, - sizeof(p->conf.remote_addr.v6))) - return (p); - } + !memcmp(&addr, &p->conf.remote_addr, sizeof(addr))) + return (p); /* try template matching */ for (p = peers; p != NULL; p = p->next) if (p->conf.template && - p->conf.remote_addr.af == ip->sa_family && - session_match_mask(p, ip)) + p->conf.remote_addr.aid == addr.aid && + session_match_mask(p, &addr)) if (loose == NULL || loose->conf.remote_masklen < p->conf.remote_masklen) loose = p; @@ -2653,21 +3044,19 @@ getpeerbyip(struct sockaddr *ip) break; } } - if (newpeer->conf.remote_addr.af == AF_INET) { - newpeer->conf.remote_addr.v4.s_addr = - ((struct sockaddr_in *)ip)->sin_addr.s_addr; + sa2addr(ip, &newpeer->conf.remote_addr); + switch (ip->sa_family) { + case AF_INET: newpeer->conf.remote_masklen = 32; - } - if (newpeer->conf.remote_addr.af == AF_INET6) { - memcpy(&p->conf.remote_addr.v6, - &((struct sockaddr_in6 *)ip)->sin6_addr, - sizeof(newpeer->conf.remote_addr.v6)); + break; + case AF_INET6: newpeer->conf.remote_masklen = 128; + break; } newpeer->conf.template = 0; newpeer->conf.cloned = 1; newpeer->state = newpeer->prev_state = STATE_NONE; - newpeer->conf.reconf_action = RECONF_REINIT; + newpeer->conf.reconf_action = RECONF_KEEP; newpeer->rbuf = NULL; init_peer(newpeer); bgp_fsm(newpeer, EVNT_START); @@ -2680,40 +3069,24 @@ getpeerbyip(struct sockaddr *ip) } int -session_match_mask(struct peer *p, struct sockaddr *ip) 
+session_match_mask(struct peer *p, struct bgpd_addr *a) { - int i; in_addr_t v4mask; - struct in6_addr *in; - struct in6_addr mask; + struct in6_addr masked; - if (p->conf.remote_addr.af == AF_INET) { + switch (p->conf.remote_addr.aid) { + case AID_INET: v4mask = htonl(prefixlen2mask(p->conf.remote_masklen)); - if (p->conf.remote_addr.v4.s_addr == - ((((struct sockaddr_in *)ip)->sin_addr.s_addr) & v4mask)) + if (p->conf.remote_addr.v4.s_addr == (a->v4.s_addr & v4mask)) return (1); - else - return (0); - } - - if (p->conf.remote_addr.af == AF_INET6) { - bzero(&mask, sizeof(mask)); - for (i = 0; i < p->conf.remote_masklen / 8; i++) - mask.s6_addr[i] = 0xff; - i = p->conf.remote_masklen % 8; - if (i) - mask.s6_addr[p->conf.remote_masklen / 8] = 0xff00 >> i; - - in = &((struct sockaddr_in6 *)ip)->sin6_addr; - - for (i = 0; i < 16; i++) - if ((in->s6_addr[i] & mask.s6_addr[i]) != - p->conf.remote_addr.addr8[i]) - return (0); + return (0); + case AID_INET6: + inet6applymask(&masked, &a->v6, p->conf.remote_masklen); - return (1); + if (!memcmp(&masked, &p->conf.remote_addr.v6, sizeof(masked))) + return (1); + return (0); } - return (0); } @@ -2733,6 +3106,7 @@ getpeerbyid(u_int32_t peerid) void session_down(struct peer *peer) { + bzero(&peer->capa.neg, sizeof(peer->capa.neg)); peer->stats.last_updown = time(NULL); if (imsg_compose(ibuf_rde, IMSG_SESSION_DOWN, peer->conf.id, 0, -1, NULL, 0) == -1) @@ -2744,39 +3118,17 @@ session_up(struct peer *p) { struct session_up sup; - if (imsg_compose(ibuf_rde, IMSG_SESSION_ADD, p->conf.id, 0, -1, - &p->conf, sizeof(p->conf)) == -1) - fatalx("imsg_compose error"); + if (!session_graceful_is_restarting(p)) + if (imsg_compose(ibuf_rde, IMSG_SESSION_ADD, p->conf.id, 0, -1, + &p->conf, sizeof(p->conf)) == -1) + fatalx("imsg_compose error"); - switch (p->sa_local.ss_family) { - case AF_INET: - sup.local_addr.af = AF_INET; - memcpy(&sup.local_addr.v4, - &((struct sockaddr_in *)&p->sa_local)->sin_addr, - sizeof(sup.local_addr.v4)); - 
sup.remote_addr.af = AF_INET; - memcpy(&sup.remote_addr.v4, - &((struct sockaddr_in *)&p->sa_remote)->sin_addr, - sizeof(sup.remote_addr.v4)); - break; - case AF_INET6: - sup.local_addr.af = AF_INET6; - memcpy(&sup.local_addr.v6, - &((struct sockaddr_in6 *)&p->sa_local)->sin6_addr, - sizeof(sup.local_addr.v6)); - sup.remote_addr.af = AF_INET6; - memcpy(&sup.remote_addr.v6, - &((struct sockaddr_in6 *)&p->sa_remote)->sin6_addr, - sizeof(sup.remote_addr.v6)); - break; - default: - fatalx("session_up: unsupported address family"); - } + sa2addr((struct sockaddr *)&p->sa_local, &sup.local_addr); + sa2addr((struct sockaddr *)&p->sa_remote, &sup.remote_addr); sup.remote_bgpid = p->remote_bgpid; sup.short_as = p->short_as; - memcpy(&sup.capa_announced, &p->capa.ann, sizeof(sup.capa_announced)); - memcpy(&sup.capa_received, &p->capa.peer, sizeof(sup.capa_received)); + memcpy(&sup.capa, &p->capa.neg, sizeof(sup.capa)); p->stats.last_updown = time(NULL); if (imsg_compose(ibuf_rde, IMSG_SESSION_UP, p->conf.id, 0, -1, &sup, sizeof(sup)) == -1) @@ -2784,9 +3136,10 @@ session_up(struct peer *p) } int -imsg_compose_parent(int type, pid_t pid, void *data, u_int16_t datalen) +imsg_compose_parent(int type, u_int32_t peerid, pid_t pid, void *data, + u_int16_t datalen) { - return (imsg_compose(ibuf_main, type, 0, pid, -1, data, datalen)); + return (imsg_compose(ibuf_main, type, peerid, pid, -1, data, datalen)); } int @@ -2795,34 +3148,6 @@ imsg_compose_rde(int type, pid_t pid, vo return (imsg_compose(ibuf_rde, type, 0, pid, -1, data, datalen)); } -static struct sockaddr * -addr2sa(struct bgpd_addr *addr, u_int16_t port) -{ - static struct sockaddr_storage ss; - struct sockaddr_in *sa_in = (struct sockaddr_in *)&ss; - struct sockaddr_in6 *sa_in6 = (struct sockaddr_in6 *)&ss; - - bzero(&ss, sizeof(ss)); - switch (addr->af) { - case AF_INET: - sa_in->sin_family = AF_INET; - sa_in->sin_len = sizeof(struct sockaddr_in); - sa_in->sin_addr.s_addr = addr->v4.s_addr; - sa_in->sin_port = 
htons(port); - break; - case AF_INET6: - sa_in6->sin6_family = AF_INET6; - sa_in6->sin6_len = sizeof(struct sockaddr_in6); - memcpy(&sa_in6->sin6_addr, &addr->v6, - sizeof(sa_in6->sin6_addr)); - sa_in6->sin6_port = htons(port); - sa_in6->sin6_scope_id = addr->scope_id; - break; - } - - return ((struct sockaddr *)&ss); -} - void session_demote(struct peer *p, int level) { @@ -2837,3 +3162,19 @@ session_demote(struct peer *p, int level p->demoted += level; } + +void +session_stop(struct peer *peer, u_int8_t subcode) +{ + switch (peer->state) { + case STATE_OPENSENT: + case STATE_OPENCONFIRM: + case STATE_ESTABLISHED: + session_notification(peer, ERR_CEASE, subcode, NULL, 0); + break; + default: + /* session not open, no need to send notification */ + break; + } + bgp_fsm(peer, EVNT_STOP); +}