/*
 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
 *
 * This is from the implementation of CUBIC TCP in
 * Injong Rhee, Lisong Xu.
 *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
 *  in PFLDnet 2005
 *  Available from:
 *   http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
 *
 * Unless CUBIC is enabled and congestion window is large
 * this behaves the same as the original Reno.
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <net/tcp.h>
#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_B		4	 /*
					  * In binary search,
					  * go to point (max+min)/N
					  */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */
/* Tunable module parameters (writable via /sys/module/tcp_cubic/parameters) */
static int fast_convergence = 1;
static int max_increment = 16;
static int beta = 819;	/* = 819/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh = 100;
static int bic_scale = 41;	/* the "C" constant, scaled by 1024 */
static int tcp_friendliness = 1;

module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(max_increment, int, 0644);
MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0644);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
51 /* BIC TCP Parameters */
53 u32 cnt; /* increase cwnd by 1 after ACKs */
54 u32 last_max_cwnd; /* last maximum snd_cwnd */
55 u32 loss_cwnd; /* congestion window at last loss */
56 u32 last_cwnd; /* the last snd_cwnd */
57 u32 last_time; /* time when updated last_cwnd */
58 u32 bic_origin_point;/* origin point of bic function */
59 u32 bic_K; /* time to origin point from the beginning of the current epoch */
60 u32 delay_min; /* min delay */
61 u32 epoch_start; /* beginning of an epoch */
62 u32 ack_cnt; /* number of acks */
63 u32 tcp_cwnd; /* estimated tcp cwnd */
64 #define ACK_RATIO_SHIFT 4
65 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
68 static inline void bictcp_reset(struct bictcp *ca)
71 ca->last_max_cwnd = 0;
75 ca->bic_origin_point = 0;
79 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
84 static void bictcp_init(struct sock *sk)
86 bictcp_reset(inet_csk_ca(sk));
88 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
/* 65536 times the cubic root */
/* precomputed cbrt(i) << 16 for i = 0..7; used as the first estimate
 * in cubic_root() before the binary-search refinement */
static const u64 cubic_table[8]
	= {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};
96 * calculate the cubic root of x
97 * the basic idea is that x can be expressed as i*8^j
98 * so cubic_root(x) = cubic_root(i)*2^j
99 * in the following code, x is i, and y is 2^j
100 * because of integer calculation, there are errors in calculation
101 * so finally use binary search to find out the exact solution
103 static u32 cubic_root(u64 x)
105 u64 y, app, target, start, end, mid, start_diff, end_diff;
112 /* first estimate lower and upper bound */
118 start = (y*cubic_table[x])>>16;
122 end = (y*cubic_table[x+1]+65535)>>16;
124 /* binary search for more accurate one */
125 while (start < end-1) {
126 mid = (start+end) >> 1;
130 else if (app > target)
136 /* find the most accurate one from start and end */
137 app = start*start*start;
139 start_diff = target - app;
141 start_diff = app - target;
144 end_diff = target - app;
146 end_diff = app - target;
148 if (start_diff < end_diff)
154 static inline u32 bictcp_K(u32 dist, u32 srtt)
161 /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
162 so K = cubic_root( (wmax-cwnd)*rtt/c )
163 the unit of K is bictcp_HZ=2^10, not HZ
166 rtt = (tp->srtt >> 3 ) / HZ
168 the following code has been designed and tested for
169 cwnd < 1 million packets
171 HZ < 1,000,00 (corresponding to 10 nano-second)
175 /* 1/c * 2^2*bictcp_HZ */
176 d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale;
179 /* srtt * 2^count / HZ
180 1) to get a better accuracy of the following d32,
181 the larger the "count", the better the accuracy
182 2) and avoid overflow of the following d64
183 the larger the "count", the high possibility of overflow
184 3) so find a "count" between bictcp_hz-3 and bictcp_hz
185 "count" may be less than bictcp_HZ,
186 then d64 becomes 0. that is OK
190 while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){
196 /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */
197 d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ);
200 d64 = cubic_root(d64);
207 * Compute congestion window to use.
209 static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
212 u32 d32, t, srtt, bic_target, min_cnt, max_cnt;
214 ca->ack_cnt++; /* count the number of ACKs */
216 if (ca->last_cwnd == cwnd &&
217 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
220 ca->last_cwnd = cwnd;
221 ca->last_time = tcp_time_stamp;
223 srtt = (HZ << 3)/10; /* use real time-based growth function */
225 if (ca->epoch_start == 0) {
226 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
227 ca->ack_cnt = 1; /* start counting */
228 ca->tcp_cwnd = cwnd; /* syn with cubic */
230 if (ca->last_max_cwnd <= cwnd) {
232 ca->bic_origin_point = cwnd;
234 ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt);
235 ca->bic_origin_point = ca->last_max_cwnd;
239 /* cubic function - calc*/
240 /* calculate c * time^3 / rtt,
241 * while considering overflow in calculation of time^3
242 * (so time^3 is done by using d64)
243 * and without the support of division of 64bit numbers
244 * (so all divisions are done by using d32)
245 * also NOTE the unit of those veriables
246 * time = (t - K) / 2^bictcp_HZ
247 * c = bic_scale >> 10
248 * rtt = (srtt >> 3) / HZ
249 * !!! The following code does not have overflow problems,
250 * if the cwnd < 1 million packets !!!
253 /* change the unit from HZ to bictcp_HZ */
254 t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
257 if (t < ca->bic_K) /* t - K */
263 d32 = (bic_scale << 3) * HZ / srtt; /* 1024*c/rtt */
264 d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ); /* c/rtt * (t-K)^3 */
266 if (t < ca->bic_K) /* below origin*/
267 bic_target = ca->bic_origin_point - d32;
268 else /* above origin*/
269 bic_target = ca->bic_origin_point + d32;
271 /* cubic function - calc bictcp_cnt*/
272 if (bic_target > cwnd) {
273 ca->cnt = cwnd / (bic_target - cwnd);
275 ca->cnt = 100 * cwnd; /* very small increment*/
278 if (ca->delay_min > 0) {
279 /* max increment = Smax * rtt / 0.1 */
280 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
281 if (ca->cnt < min_cnt)
285 /* slow start and low utilization */
286 if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
290 if (tcp_friendliness) {
291 u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta);
292 d32 = (cwnd * scale) >> 3;
293 while (ca->ack_cnt > d32) { /* update tcp cwnd */
298 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
299 d32 = ca->tcp_cwnd - cwnd;
300 max_cnt = cwnd / d32;
301 if (ca->cnt > max_cnt)
306 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
307 if (ca->cnt == 0) /* cannot be zero */
312 /* Keep track of minimum rtt */
313 static inline void measure_delay(struct sock *sk)
315 const struct tcp_sock *tp = tcp_sk(sk);
316 struct bictcp *ca = inet_csk_ca(sk);
320 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
321 /* Discard delay samples right after fast recovery */
322 (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
325 delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
329 /* first time call or link delay decreases */
330 if (ca->delay_min == 0 || ca->delay_min > delay)
331 ca->delay_min = delay;
334 static void bictcp_cong_avoid(struct sock *sk, u32 ack,
335 u32 seq_rtt, u32 in_flight, int data_acked)
337 struct tcp_sock *tp = tcp_sk(sk);
338 struct bictcp *ca = inet_csk_ca(sk);
343 if (!tcp_is_cwnd_limited(sk, in_flight))
346 if (tp->snd_cwnd <= tp->snd_ssthresh)
349 bictcp_update(ca, tp->snd_cwnd);
351 /* In dangerous area, increase slowly.
352 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
354 if (tp->snd_cwnd_cnt >= ca->cnt) {
355 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
357 tp->snd_cwnd_cnt = 0;
364 static u32 bictcp_recalc_ssthresh(struct sock *sk)
366 const struct tcp_sock *tp = tcp_sk(sk);
367 struct bictcp *ca = inet_csk_ca(sk);
369 ca->epoch_start = 0; /* end of epoch */
371 /* Wmax and fast convergence */
372 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
373 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
374 / (2 * BICTCP_BETA_SCALE);
376 ca->last_max_cwnd = tp->snd_cwnd;
378 ca->loss_cwnd = tp->snd_cwnd;
380 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
383 static u32 bictcp_undo_cwnd(struct sock *sk)
385 struct bictcp *ca = inet_csk_ca(sk);
387 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
390 static u32 bictcp_min_cwnd(struct sock *sk)
392 return tcp_sk(sk)->snd_ssthresh;
395 static void bictcp_state(struct sock *sk, u8 new_state)
397 if (new_state == TCP_CA_Loss)
398 bictcp_reset(inet_csk_ca(sk));
401 /* Track delayed acknowledgment ratio using sliding window
402 * ratio = (15*ratio + sample) / 16
404 static void bictcp_acked(struct sock *sk, u32 cnt)
406 const struct inet_connection_sock *icsk = inet_csk(sk);
408 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
409 struct bictcp *ca = inet_csk_ca(sk);
410 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
411 ca->delayed_ack += cnt;
416 static struct tcp_congestion_ops cubictcp = {
418 .ssthresh = bictcp_recalc_ssthresh,
419 .cong_avoid = bictcp_cong_avoid,
420 .set_state = bictcp_state,
421 .undo_cwnd = bictcp_undo_cwnd,
422 .min_cwnd = bictcp_min_cwnd,
423 .pkts_acked = bictcp_acked,
424 .owner = THIS_MODULE,
428 static int __init cubictcp_register(void)
430 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
431 return tcp_register_congestion_control(&cubictcp);
434 static void __exit cubictcp_unregister(void)
436 tcp_unregister_congestion_control(&cubictcp);
/* Standard kernel-module entry/exit and metadata */
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.0");