From 6cf9dfd3bd62edfff69f11c0f111bc261166e4c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 31 Aug 2015 15:58:44 +0200 Subject: [PATCH 1/4] net: fib: move metrics parsing to a helper fib_create_info() is already quite large, so before adding more code to the metrics section move that to a helper, similar to ip6_convert_metrics. Suggested-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 71 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1b2d01170a4de8..88afbae893f0d4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -876,6 +876,44 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + fi->fib_metrics[type - 1] = val; + } + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -948,36 +986,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH From 1bb14807bc761a88bb9d319e7bf519eebf4c82ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:45 +0200 Subject: [PATCH 2/4] net: fib6: reduce identation in ip6_convert_metrics Reduce the identation a bit, there's no need to artificically have it increased. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/route.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 308dd5f9158f17..0261b721b34bbd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1711,26 +1711,26 @@ static int ip6_convert_metrics(struct mx6_config *mxc, nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); + u32 val; - if (type) { - u32 val; - - if (unlikely(type > RTAX_MAX)) - goto err; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + goto err; + } else { + val = nla_get_u32(nla); } + + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); } mxc->mx = mp; From b8d3e4163a3562d7cba486687904383e78e7dd6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:46 +0200 Subject: [PATCH 3/4] fib, fib6: reject invalid feature bits Feature bits that are invalid should not be accepted by the kernel, only the lower 4 bits may be configured, but not the remaining ones. Even from these 4, 2 of them are unused. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 11 +++++++---- net/ipv4/fib_semantics.c | 2 ++ net/ipv6/route.c | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356e1..702024769c74bc 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -418,10 +418,13 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN 0x00000001 -#define RTAX_FEATURE_SACK 0x00000002 -#define RTAX_FEATURE_TIMESTAMP 0x00000004 -#define RTAX_FEATURE_ALLFRAG 0x00000008 +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) struct rta_session { __u8 proto; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 88afbae893f0d4..115a08e70d4324 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -908,6 +908,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; fi->fib_metrics[type - 1] = val; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0261b721b34bbd..8771530df45ea0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1728,6 +1728,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; mp[type - 1] = val; __set_bit(type - 1, mxc->mx_valid); From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:47 +0200 Subject: [PATCH 4/4] tcp: use dctcp if enabled on the route to the initiator Currently, the following case doesn't use DCTCP, even if it should: A responder has f.e. Cubic as system wide default, but for a specific route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The initiating host then uses DCTCP as congestion control, but since the initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok, and we have to fall back to Reno after 3WHS completes. We were thinking on how to solve this in a minimal, non-intrusive way without bloating tcp_ecn_create_request() needlessly: lets cache the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0) is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows to only do a single metric feature lookup inside tcp_ecn_create_request(). Joint work with Florian Westphal. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/dst.h | 6 ++++++ include/net/tcp.h | 2 +- net/core/rtnetlink.c | 6 ++++++ net/ipv4/fib_semantics.c | 6 +++++- net/ipv4/tcp_cong.c | 9 ++++++--- net/ipv4/tcp_input.c | 7 +++++-- net/ipv6/route.c | 9 +++++++-- 7 files changed, 36 insertions(+), 9 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 4c4801645371d1..9261d928303d47 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) p[metric-1] = val; } +/* Kernel-internal feature bits that are unallocated in user space. */ +#define DST_FEATURE_ECN_CA (1 << 31) + +#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) +#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) + static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a7b03947a38bc..0cab28cd43a906 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 788ceed394636e..a466821d1441f2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 115a08e70d4324..992a9597daf80d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { @@ -913,6 +914,9 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) fi->fib_metrics[type - 1] = val; } + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + return 0; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a2ed23c595cf18..93c4dc3ab23fdf 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc08e235266523..a8f515bb19c4bb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8771530df45ea0..f45cac6f835639 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1698,6 +1698,7 @@ static int ip6_dst_gc(struct dst_ops *ops) static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; u32 *mp; @@ -1722,7 +1723,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { @@ -1735,8 +1736,12 @@ static int ip6_convert_metrics(struct mx6_config *mxc, __set_bit(type - 1, mxc->mx_valid); } - mxc->mx = mp; + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + } + mxc->mx = mp; return 0; err: kfree(mp);