Skip to content

Commit

Permalink
tcp: use dctcp if enabled on the route to the initiator
Browse files Browse the repository at this point in the history
Currently, the following case doesn't use DCTCP, even if it should:
A responder has f.e. Cubic as system wide default, but for a specific
route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The
initiating host then uses DCTCP as congestion control, but since the
initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok,
and we have to fall back to Reno after 3WHS completes.

We were thinking on how to solve this in a minimal, non-intrusive
way without bloating tcp_ecn_create_request() needlessly: lets cache
the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0)
is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES
contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows
to only do a single metric feature lookup inside tcp_ecn_create_request().

Joint work with Florian Westphal.

Signed-off-by: Daniel Borkmann <[email protected]>
Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
borkmann authored and davem330 committed Aug 31, 2015
1 parent b8d3e41 commit c3a8d94
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 9 deletions.
6 changes: 6 additions & 0 deletions include/net/dst.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
p[metric-1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA (1 << 31)

#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA)
#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
Expand Down
2 changes: 1 addition & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;

struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(const char *name);
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
Expand Down
6 changes: 6 additions & 0 deletions net/core/rtnetlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
} else if (i == RTAX_FEATURES - 1) {
u32 user_features = metrics[i] & RTAX_FEATURE_MASK;

BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
if (nla_put_u32(skb, i + 1, user_features))
goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
Expand Down
6 changes: 5 additions & 1 deletion net/ipv4/fib_semantics.c
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
bool ecn_ca = false;
struct nlattr *nla;
int remaining;

Expand All @@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
char tmp[TCP_CA_NAME_MAX];

nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp);
val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
return -EINVAL;
} else {
Expand All @@ -913,6 +914,9 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
fi->fib_metrics[type - 1] = val;
}

if (ecn_ca)
fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

return 0;
}

Expand Down
9 changes: 6 additions & 3 deletions net/ipv4/tcp_cong.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

u32 tcp_ca_get_key_by_name(const char *name)
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key;
u32 key = TCP_CA_UNSPEC;

might_sleep();

rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
key = ca ? ca->key : TCP_CA_UNSPEC;
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
}
rcu_read_unlock();

return key;
Expand Down
7 changes: 5 additions & 2 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
u32 ecn_ok_dst;

if (!th_ecn)
return;

ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;

if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA))
inet_rsk(req)->ecn_ok = 1;
}

Expand Down
9 changes: 7 additions & 2 deletions net/ipv6/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -1698,6 +1698,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg)
{
bool ecn_ca = false;
struct nlattr *nla;
int remaining;
u32 *mp;
Expand All @@ -1722,7 +1723,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
char tmp[TCP_CA_NAME_MAX];

nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp);
val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
goto err;
} else {
Expand All @@ -1735,8 +1736,12 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
__set_bit(type - 1, mxc->mx_valid);
}

mxc->mx = mp;
if (ecn_ca) {
__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
}

mxc->mx = mp;
return 0;
err:
kfree(mp);
Expand Down

0 comments on commit c3a8d94

Please sign in to comment.