patch-2.1.36 linux/net/ipv4/tcp_input.c
- Lines: 2294
- Date: Tue Apr 22 22:46:28 1997
- Orig file: v2.1.35/linux/net/ipv4/tcp_input.c
- Orig date: Mon Apr 14 16:28:28 1997
diff -u --recursive --new-file v2.1.35/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.42 1997/04/12 04:32:24 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.50 1997/04/22 02:53:12 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -57,6 +57,12 @@
u32 seq_rtt);
int sysctl_tcp_cong_avoidance = 0;
+int sysctl_tcp_hoe_retransmits = 0;
+int sysctl_tcp_sack = 0;
+int sysctl_tcp_tsack = 0;
+int sysctl_tcp_timestamps = 0;
+int sysctl_tcp_window_scaling = 0;
+
static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
@@ -72,9 +78,7 @@
{
int m;
- /*
- * Delayed ACK time estimator.
- */
+ /* Delayed ACK time estimator. */
m = jiffies - tp->lrcvtime;
@@ -83,12 +87,10 @@
if (m < 0)
return;
- /*
- * if the mesured value is bigger than
+ /* If the measured value is bigger than
* twice the round trip time ignore it.
*/
- if ((m << 2) <= tp->srtt)
- {
+ if ((m << 2) <= tp->srtt) {
m -= (tp->iat >> 3);
tp->iat += m;
@@ -102,18 +104,21 @@
if (tp->ato < HZ/50)
tp->ato = HZ/50;
- }
- else
+ } else
tp->ato = 0;
}
-/*
- * Called on frames that were known _not_ to have been
- * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
- * The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
*/
-extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
+static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
long m;
/*
@@ -122,8 +127,7 @@
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
* m stands for "measurement".
- */
- /*
+ *
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*/
@@ -140,44 +144,73 @@
m -= (tp->mdev >> 2); /* similar update on mdev */
tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
} else {
- /* no previous measure. */
+ /* no previous measure. */
tp->srtt = m<<3; /* take the measured time to be rtt */
tp->mdev = m<<2; /* make sure rto = 3*rtt */
}
+}
+/* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
- /*
- * Now update timeout. Note that this removes any backoff.
- */
-
+static __inline__ void tcp_set_rto(struct tcp_opt *tp)
+{
tp->rto = (tp->srtt >> 3) + tp->mdev;
tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
+}
+
+/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
+ * on packet lifetime in the internet. We need the HZ/5 lower
+ * bound to behave correctly against BSD stacks with a fixed
+ * delayed ack.
+ * FIXME: It's not entirely clear this lower bound is the best
+ * way to avoid the problem. Is it possible to drop the lower
+ * bound and still avoid trouble with BSD stacks? Perhaps
+ * some modification to the RTO calculation that takes delayed
+ * ack bias into account? This needs serious thought. -- erics
+ */
+static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
+{
if (tp->rto > 120*HZ)
tp->rto = 120*HZ;
-
- /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks
- * FIXME: It's not entirely clear this lower bound is the best
- * way to avoid the problem. Is it possible to drop the lower
- * bound and still avoid trouble with BSD stacks? Perhaps
- * some modification to the RTO calculation that takes delayed
- * ack bais into account? This needs serious thought. -- erics
- */
if (tp->rto < HZ/5)
tp->rto = HZ/5;
+}
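For reference, the three split routines compose on a fresh, non-retransmitted
sample exactly as tcp_ack() below uses them. A minimal sketch of the sequence
(an illustrative driver, not a new kernel entry point):

	static __inline__ void tcp_rto_from_sample(struct tcp_opt *tp, __u32 mrtt)
	{
		tcp_rtt_estimator(tp, mrtt);	/* update scaled srtt and mdev */
		tcp_set_rto(tp);		/* rto from srtt/8 + mdev */
		tcp_bound_rto(tp);		/* clamp to [HZ/5, 120*HZ] */
	}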
+
+/* WARNING: this must not be called if tp->saw_tstamp was false. */
+
+extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq)
+{
+ /* From draft-ietf-tcplw-high-performance: the correct
+ * test is last_ack_sent <= end_seq.
+ * (RFC1323 stated last_ack_sent < end_seq.)
+ */
+ if (!before(end_seq,tp->last_ack_sent)) {
+ tp->ts_recent = tp->rcv_tsval;
+ /* FIXME: need a coarse timestamp. Days uptime
+ * would be good.
+ */
+ tp->ts_recent_stamp = jiffies;
+ }
+}
- tp->backoff = 0;
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
+{
+ /* FIXME: must check that ts_recent is not
+ * more than 24 days old here. Yuck.
+ */
+ return ((__s32)(tp->rcv_tsval - tp->ts_recent) < 0);
}
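A sketch of what the FIXME above asks for: only discard on an old tsval while
ts_recent itself is still fresh, using the same wrap-safe signed comparison.
TCP_PAWS_24DAYS is a hypothetical constant (24 days in jiffies), not defined
by this patch:

	#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24 * HZ)	/* hypothetical */

	extern __inline__ int tcp_paws_discard_sketch(struct tcp_opt *tp)
	{
		return ((__s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
			jiffies - tp->ts_recent_stamp <= TCP_PAWS_24DAYS);
	}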
+
static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
- u32 end_window;
+ u32 end_window = tp->rcv_wup + tp->rcv_wnd;
- end_window = tp->rcv_wup + tp->rcv_wnd;
-
- if (tp->rcv_wnd)
- {
+ if (tp->rcv_wnd) {
if (!before(seq, tp->rcv_nxt) && before(seq, end_window))
return 1;
@@ -196,9 +229,8 @@
extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
if (seq == tp->rcv_nxt)
- {
return (tp->rcv_wnd || (end_seq == seq));
- }
+
return __tcp_sequence(tp, seq, end_seq);
}
@@ -210,9 +242,8 @@
static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
sk->zapped = 1;
- /*
- * We want the right error as BSD sees it (and indeed as we do).
- */
+
+ /* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->state) {
case TCP_TIME_WAIT:
break;
@@ -224,7 +255,7 @@
break;
default:
sk->err = ECONNRESET;
- }
+ };
#ifdef CONFIG_TCP_RFC1337
/*
* Time wait assassination protection [RFC1337]
@@ -234,8 +265,7 @@
* Ian Heavens has since shown this is an inadequate fix for the protocol
* bug in question.
*/
- if(sk->state!=TCP_TIME_WAIT)
- {
+ if(sk->state!=TCP_TIME_WAIT) {
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
}
@@ -249,34 +279,30 @@
return(0);
}
-
/*
- * Look for tcp options. Parses everything but only knows about MSS.
- * This routine is always called with the packet containing the SYN.
- * However it may also be called with the ack to the SYN. So you
- * can't assume this is always the SYN. It's always called after
- * we have set up sk->mtu to our own MTU.
- *
- * We need at minimum to add PAWS support here. Possibly large windows
- * as Linux gets deployed on 100Mb/sec networks.
+ * Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ * FIXME: surely this can be more efficient. -- erics
*/
-int tcp_parse_options(struct tcphdr *th)
+void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp)
{
unsigned char *ptr;
int length=(th->doff*4)-sizeof(struct tcphdr);
- int mss = 0;
ptr = (unsigned char *)(th + 1);
+ tp->sacks = 0;
+ tp->saw_tstamp = 0;
- while(length>0)
- {
+ while(length>0) {
int opcode=*ptr++;
int opsize=*ptr++;
- switch(opcode)
- {
+ if (length - opsize < 0) /* Don't parse partial options */
+ break;
+ switch(opcode) {
case TCPOPT_EOL:
- return 0;
+ return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
ptr--; /* the opsize=*ptr++ above was a mistake */
@@ -284,25 +310,86 @@
default:
if(opsize<=2) /* Avoid silly options looping forever */
- return 0;
- switch(opcode)
- {
+ return;
+ switch(opcode) {
case TCPOPT_MSS:
- if(opsize==TCPOLEN_MSS && th->syn)
- {
- mss = ntohs(*(unsigned short *)ptr);
- }
+ if(opsize==TCPOLEN_MSS && th->syn) {
+ tp->in_mss = ntohs(*(__u16 *)ptr);
+ if (tp->in_mss == 0)
+ tp->in_mss = 536;
+ }
break;
- /* Add other options here as people feel the urge to implement stuff like large windows */
+ case TCPOPT_WINDOW:
+ if(opsize==TCPOLEN_WINDOW && th->syn)
+ if (sysctl_tcp_window_scaling)
+ tp->snd_wscale = *(__u8 *)ptr;
+ break;
+ case TCPOPT_SACK_PERM:
+ if(opsize==TCPOLEN_SACK_PERM && th->syn)
+ if (sysctl_tcp_sack)
+ tp->sack_ok = 1;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if(opsize==TCPOLEN_TIMESTAMP) {
+ /* Cheaper to set again than to
+ * test syn. Optimize this?
+ */
+ if (sysctl_tcp_timestamps)
+ tp->tstamp_ok = 1;
+ tp->saw_tstamp = 1;
+ tp->rcv_tsval = ntohl(*(__u32 *)ptr);
+ tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
+ }
+ break;
+ case TCPOPT_SACK:
+ tp->sacks = (opsize-2)>>3;
+ if (tp->sacks<<3 == opsize-2) {
+ int i;
+ for (i = 0; i < tp->sacks; i++) {
+ tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]);
+ tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]);
+ }
+ } else
+ tp->sacks = 0;
}
ptr+=opsize-2;
length-=opsize;
- }
+ };
}
+}
- return mss;
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ * This should probably get extended for timestamps + SACK as well.
+ * Assembly code anyone? -- erics
+ */
+static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp)
+{
+ if (tp->tcp_header_len == sizeof(struct tcphdr))
+ return 0;
+ if (th->doff == sizeof(struct tcphdr)>>2) {
+ tp->saw_tstamp = 0;
+ tp->sacks = 0;
+ return 0;
+ } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) {
+ __u32 *ptr = (__u32 *)(th + 1);
+ if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->saw_tstamp = 1;
+ tp->sacks = 0;
+ tp->rcv_tsval = ntohl(*++ptr);
+ tp->rcv_tsecr = ntohl(*++ptr);
+ return 1;
+ }
+ }
+ tcp_parse_options(th,tp);
+ return 1;
}
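The aligned word tested above is the canonical timestamp layout recommended by
RFC1323 appendix A: two NOPs followed by the 10-byte timestamp option. A
sketch of how a sender lays out that 12-byte block (illustrative; this patch
does not show the output path):

	__u32 *opt = (__u32 *)(th + 1);		/* th: outgoing header, assumed */
	opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
			| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
	opt[1] = htonl(jiffies);		/* TSval: our clock */
	opt[2] = htonl(tp->ts_recent);		/* TSecr: echo peer's TSval */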
+#if 0
+
+/*
+ * This is the old fast retransmit code. It will go away eventually. -- erics
+ */
/*
* See draft-stevens-tcpca-spec-01 for documentation.
@@ -332,62 +419,170 @@
* The packet acked data after high_seq;
*/
- if (ack == tp->snd_una && atomic_read(&sk->packets_out) && (not_dup == 0))
- {
- /*
- * 1. When the third duplicate ack is received, set ssthresh
- * to one half the current congestion window, but no less
- * than two segments. Retransmit the missing segment.
+ if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
+ /* 1. When the third duplicate ack is received, set ssthresh
+ * to one half the current congestion window, but no less
+ * than two segments. Retransmit the missing segment.
*/
+ if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
+ tp->dup_acks++;
- if (tp->high_seq == 0 || after(ack, tp->high_seq))
- {
- sk->dup_acks++;
-
- if (sk->dup_acks == 3)
- {
- sk->ssthresh = max(tp->snd_cwnd >> 1, 2);
- tp->snd_cwnd = sk->ssthresh + 3;
+ if (tp->dup_acks == 3) {
+ tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
+ tp->snd_cwnd = tp->snd_ssthresh + 3;
tcp_do_retransmit(sk, 0);
- /* careful not to timeout just after fast
+
+ /* Careful not to timeout just after fast
* retransmit!
*/
- tcp_reset_xmit_timer(sk, TIME_RETRANS,
- tp->rto);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
}
}
- /*
- * 2. Each time another duplicate ACK arrives, increment
- * cwnd by the segment size. [...] Transmit a packet...
+ /* 2. Each time another duplicate ACK arrives, increment
+ * cwnd by the segment size. [...] Transmit a packet...
*
- * Packet transmission will be done on normal flow processing
- * since we're not in "retransmit mode"
+ * Packet transmission will be done on normal flow processing
+ * since we're not in "retransmit mode".
*/
-
- if (sk->dup_acks >= 3)
- {
- sk->dup_acks++;
+ if (tp->dup_acks >= 3) {
+ tp->dup_acks++;
tp->snd_cwnd++;
}
- }
- else
- {
- /*
- * 3. When the next ACK arrives that acknowledges new data,
- * set cwnd to ssthresh
+ } else {
+ /* 3. When the next ACK arrives that acknowledges new data,
+ * set cwnd to ssthresh.
*/
-
- if (sk->dup_acks >= 3)
- {
+ if (tp->dup_acks >= 3) {
tp->retrans_head = NULL;
- tp->snd_cwnd = max(sk->ssthresh, 1);
- atomic_set(&sk->retransmits, 0);
+ tp->snd_cwnd = max(tp->snd_ssthresh, 1);
+ tp->retransmits = 0;
}
- sk->dup_acks = 0;
+ tp->dup_acks = 0;
+
+ /* FIXME: This is wrong if the new ack that arrives
+ * is below the value for high_seq.
+ */
tp->high_seq = 0;
}
}
+#endif
+
+#define FLAG_DATA 0x01
+#define FLAG_WIN_UPDATE 0x02
+#define FLAG_DATA_ACKED 0x04
+
+static __inline__ void clear_fast_retransmit(struct sock *sk) {
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ if (tp->dup_acks > 3) {
+ tp->retrans_head = NULL;
+ tp->snd_cwnd = max(tp->snd_ssthresh, 1);
+ }
+ tp->dup_acks = 0;
+}
+
+/*
+ * NOTE: This code assumes that tp->dup_acks gets cleared when a
+ * retransmit timer fires.
+ */
+
+static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
+{
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+
+ /*
+ * Note: If not_dup is set this implies we got a
+ * data carrying packet or a window update.
+ * This carries no new information about possible
+ * lost packets, so we have to ignore it for the purposes
+ * of counting duplicate acks. Ideally this does not imply we
+ * should stop our fast retransmit phase; more acks may come
+ * later without data to help us. Unfortunately this would make
+ * the code below much more complex. For now if I see such
+ * a packet I clear the fast retransmit phase.
+ */
+
+ if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
+ /* This is the standard reno style fast retransmit branch. */
+
+ /* 1. When the third duplicate ack is received, set ssthresh
+ * to one half the current congestion window, but no less
+ * than two segments. Retransmit the missing segment.
+ */
+ if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
+ tp->dup_acks++;
+ if (tp->dup_acks == 3) {
+ tp->dup_acks++;
+ tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
+ tp->snd_cwnd = tp->snd_ssthresh + 3;
+ tp->high_seq = tp->snd_nxt;
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
+ }
+
+ /* 2. Each time another duplicate ACK arrives, increment
+ * cwnd by the segment size. [...] Transmit a packet...
+ *
+ * Packet transmission will be done on normal flow processing
+ * since we're not in "retransmit mode"
+ */
+ if (tp->dup_acks > 3)
+ tp->snd_cwnd++;
+ } else if (tp->high_seq != 0) {
+ /* In this branch we deal with clearing the Floyd style
+ * block on duplicate fast retransmits, and if requested
+ * we do Hoe style secondary fast retransmits.
+ */
+ if (!before(ack,tp->high_seq) || (not_dup&FLAG_DATA) != 0) {
+ /* Once we have acked all the packets up to high_seq
+ * we are done with this fast retransmit phase.
+ * Alternatively data arrived. In this case we
+ * have to abort the fast retransmit attempt.
+ * Note that we do want to accept a window
+ * update since this is expected with Hoe's algorithm.
+ */
+ clear_fast_retransmit(sk);
+
+ /* After we have cleared up to high_seq we can
+ * clear the Floyd style block.
+ */
+ if (after(ack,tp->high_seq))
+ tp->high_seq = 0;
+ } else if (tp->dup_acks >= 3) {
+ if (sysctl_tcp_hoe_retransmits) {
+ /* Hoe Style. We didn't ack the whole
+ * window. Take this as a cue that
+ * another packet was lost and retransmit it.
+ * Don't muck with the congestion window here.
+ * Note that we have to be careful not to
+ * act if this was a window update and it
+ * didn't ack new data, since this does
+ * not indicate a packet left the system.
+ * We can test this by just checking
+ * if ack changed from snd_una, since
+ * the only way to get here without
+ * advancing from snd_una is if this was a
+ * window update.
+ */
+ if (ack != tp->snd_una && before(ack,tp->high_seq)) {
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS,
+ tp->rto);
+ }
+ } else {
+ /* Reno style. We didn't ack the whole
+ * window, now we have to drop out of
+ * fast retransmit and wait for a timeout.
+ */
+ clear_fast_retransmit(sk);
+ }
+ }
+ } else {
+ /* Clear any aborted fast retransmit starts. */
+ tp->dup_acks = 0;
+ }
+}
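To make the thresholds above concrete, a toy restatement of the Reno half of
the rules with illustrative names (non-kernel code; the real routine also
manages high_seq and the retransmit timer):

	static void reno_dup_ack_sketch(unsigned int *cwnd,
					unsigned int *ssthresh,
					unsigned int *dup_acks)
	{
		if (++(*dup_acks) == 3) {
			/* Third dup ack: halve cwnd (floor of two), inflate
			 * by three, and retransmit the head segment.
			 */
			*ssthresh = ((*cwnd >> 1) > 2) ? (*cwnd >> 1) : 2;
			*cwnd = *ssthresh + 3;
		} else if (*dup_acks > 3) {
			(*cwnd)++;	/* one more dup ack, one more segment */
		}
	}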
/*
* TCP slow start and congestion avoidance in two flavors:
@@ -401,24 +596,20 @@
static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
u32 seq_rtt)
{
- struct tcp_opt * tp;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
unsigned int actual, expected;
unsigned int inv_rtt, inv_basertt, inv_basebd;
u32 snt_bytes;
- /*
- * From:
+ /* From:
* TCP Vegas: New Techniques for Congestion
* Detection and Avoidance.
*
- *
* Warning: This code is a scratch implementation taken
* from the paper only. The code they distribute seems
* to have improved several things over the initial spec.
*/
- tp = &(sk->tp_pinfo.af_tcp);
-
if (!seq_rtt)
seq_rtt = 1;
@@ -427,11 +618,8 @@
else
tp->basertt = seq_rtt;
- /*
- *
- * actual = throughput for this segment.
+ /* actual = throughput for this segment.
* expected = number_of_bytes in transit / BaseRTT
- *
*/
snt_bytes = ack - seq;
@@ -443,55 +631,36 @@
expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
+ /* XXX sk->mss should move into tcp_opt as well -DaveM */
inv_basebd = sk->mss * inv_basertt;
- /*
- * Slow Start
- */
-
- if (tp->snd_cwnd < sk->ssthresh &&
+ /* Slow Start */
+ if (tp->snd_cwnd < tp->snd_ssthresh &&
(seq == tp->snd_nxt ||
- (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd)))
- {
- /*
- * "Vegas allows exponential growth only every other
- * RTT"
- */
-
- if (sk->cong_count++)
- {
+ (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) {
+ /* "Vegas allows exponential growth only every other RTT" */
+ if (tp->snd_cwnd_cnt++) {
tp->snd_cwnd++;
- sk->cong_count = 0;
+ tp->snd_cwnd_cnt = 0;
}
- }
- else
- {
- /*
- * Congestion Avoidance
- */
-
- if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd)
- {
+ } else {
+ /* Congestion Avoidance */
+ if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) {
/* Increase Linearly */
-
- if (sk->cong_count++ >= tp->snd_cwnd)
- {
+ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
tp->snd_cwnd++;
- sk->cong_count = 0;
+ tp->snd_cwnd_cnt = 0;
}
}
- if (expected - actual >= TCP_VEGAS_BETA * inv_basebd)
- {
+ if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) {
/* Decrease Linearly */
-
- if (sk->cong_count++ >= tp->snd_cwnd)
- {
+ if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
tp->snd_cwnd--;
- sk->cong_count = 0;
+ tp->snd_cwnd_cnt = 0;
}
- /* Never less than 2 segments */
+ /* Never less than 2 segments. */
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
}
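For clarity, the congestion-avoidance test above in floating point; the kernel
uses the fixed-point inverses (inv_rtt, inv_basertt) to avoid divisions. All
names here are illustrative:

	static double vegas_diff_sketch(double snt_bytes, double rtt,
					double in_flight, double basertt,
					double mss)
	{
		double actual   = snt_bytes / rtt;	/* measured throughput */
		double expected = in_flight / basertt;	/* ideal throughput */
		return (expected - actual) * basertt / mss; /* segments queued */
	}

The window then grows once per snd_cwnd_cnt cycle while the diff stays at or
below TCP_VEGAS_ALPHA, and shrinks (never below two segments) while it is at
or above TCP_VEGAS_BETA.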
@@ -500,17 +669,16 @@
static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt)
{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * This is Jacobson's slow start and congestion avoidance.
+ /* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328. Because we keep cong_window in
* integral mss's, we can't do cwnd += 1 / cwnd.
* Instead, maintain a counter and increment it once every
* cwnd times.
* FIXME: Check to be sure the mathematics works out right
* on this trick when we have to reduce the congestion window.
- * The cong_count has to be reset properly when reduction events
+ * The snd_cwnd_cnt has to be reset properly when reduction events
* happen.
* FIXME: What happens when the congestion window gets larger
* than the maximum receiver window by some large factor
@@ -520,38 +688,22 @@
* be reduced to is not clear, since 1/2 the old window may
* still be larger than the maximum sending rate we ever achieved.
*/
-
- if (tp->snd_cwnd <= sk->ssthresh)
- {
- /*
- * In "safe" area, increase
- */
-
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
tp->snd_cwnd++;
- }
- else
- {
- /*
- * In dangerous area, increase slowly.
- * In theory this is
- * tp->snd_cwnd += 1 / tp->snd_cwnd
+ } else {
+ /* In dangerous area, increase slowly. In theory this is
+ * tp->snd_cwnd += 1 / tp->snd_cwnd
*/
-
- if (sk->cong_count >= tp->snd_cwnd) {
-
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
tp->snd_cwnd++;
- sk->cong_count = 0;
- }
- else
- sk->cong_count++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
}
}
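The snd_cwnd_cnt trick is the integer equivalent of cwnd += 1/cwnd per ack,
i.e. one extra segment per round trip. A toy illustration (not kernel code):

	static void cong_avoid_counter_sketch(unsigned int *cwnd,
					      unsigned int *cnt)
	{
		if ((*cnt)++ >= *cwnd) {	/* a full window of acks seen */
			(*cwnd)++;
			*cnt = 0;
		}
	}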
-#define FLAG_DATA 0x01
-#define FLAG_WIN_UPDATE 0x02
-#define FLAG_DATA_ACKED 0x04
-
static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
__u32 *seq_rtt)
{
@@ -560,25 +712,18 @@
unsigned long now = jiffies;
int acked = 0;
- while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head))
- {
-
+ while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
#ifdef TCP_DEBUG
/* Check for a bug. */
-
if (skb->next != (struct sk_buff*) &sk->write_queue &&
after(skb->end_seq, skb->next->seq))
- {
printk(KERN_DEBUG "INET: tcp_input.c: *** "
"bug send_list out of order.\n");
- }
#endif
- /*
- * If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived the
- * other end.
+ /* If our packet is before the ack sequence we can
+ * discard it as it's confirmed to have arrived at
+ * the other end.
*/
-
if (after(skb->end_seq, ack))
break;
@@ -591,7 +736,7 @@
* do packet "repackaging" for stacks that don't
* like overlapping packets.
*/
- atomic_dec(&sk->packets_out);
+ tp->packets_out--;
*seq = skb->seq;
*seq_rtt = now - skb->when;
@@ -601,13 +746,11 @@
kfree_skb(skb, FREE_WRITE);
}
- if (acked)
- {
+ if (acked) {
tp->retrans_head = NULL;
if (!sk->dead)
sk->write_space(sk);
}
-
return acked;
}
@@ -615,27 +758,18 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * Our probe was answered
- */
+ /* Our probe was answered. */
tp->probes_out = 0;
- /*
- * Was it a usable window open ?
- */
+ /* Was it a usable window open? */
/* should always be non-null */
if (tp->send_head != NULL &&
- !before (ack + tp->snd_wnd, tp->send_head->end_seq))
- {
+ !before (ack + tp->snd_wnd, tp->send_head->end_seq)) {
tp->backoff = 0;
tp->pending = 0;
-
tcp_clear_xmit_timer(sk, TIME_PROBE0);
-
- }
- else
- {
+ } else {
tcp_reset_xmit_timer(sk, TIME_PROBE0,
min(tp->rto << tp->backoff, 120*HZ));
}
@@ -648,138 +782,126 @@
static int tcp_ack(struct sock *sk, struct tcphdr *th,
u32 ack_seq, u32 ack, int len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int flag = 0;
u32 seq = 0;
u32 seq_rtt = 0;
struct sk_buff *skb;
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
if (tp->pending == TIME_KEEPOPEN)
- {
tp->probes_out = 0;
- }
tp->rcv_tstamp = jiffies;
- /*
- * If the ack is newer than sent or older than previous acks
- * then we can probably ignore it.
+ /* If the ack is newer than sent or older than previous acks
+ * then we can probably ignore it.
*/
-
if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
goto uninteresting_ack;
- /*
- * If there is data set flag 1
- */
-
- if (len != th->doff*4)
- {
+ /* If there is data set flag 1 */
+ if (len != th->doff*4) {
flag |= FLAG_DATA;
tcp_delack_estimator(tp);
}
- /*
- * Update our send window
- */
+ /* Update our send window. */
- /*
- * This is the window update code as per RFC 793
- * snd_wl{1,2} are used to prevent unordered
- * segments from shrinking the window
+ /* This is the window update code as per RFC 793
+ * snd_wl{1,2} are used to prevent unordered
+ * segments from shrinking the window
*/
-
if (before(tp->snd_wl1, ack_seq) ||
- (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack)))
- {
- unsigned long nwin;
-
- nwin = ntohs(th->window);
- if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd))
- {
+ (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
+ unsigned long nwin = ntohs(th->window);
+
+ if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
tp->snd_wnd = nwin;
tp->snd_wl1 = ack_seq;
tp->snd_wl2 = ack;
- if (nwin > sk->max_window)
- sk->max_window = nwin;
+ if (nwin > tp->max_window)
+ tp->max_window = nwin;
}
}
- /*
- * We passed data and got it acked, remove any soft error
- * log. Something worked...
+ /* We passed data and got it acked, remove any soft error
+ * log. Something worked...
*/
-
sk->err_soft = 0;
- /*
- * If this ack opens up a zero window, clear backoff. It was
- * being used to time the probes, and is probably far higher than
- * it needs to be for normal retransmission.
+ /* If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
*/
-
if (tp->pending == TIME_PROBE0)
- {
tcp_ack_probe(sk, ack);
- }
-
- /*
- * See if we can take anything off of the retransmit queue.
- */
+ /* See if we can take anything off of the retransmit queue. */
if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt))
flag |= FLAG_DATA_ACKED;
-
- /*
- * if we where retransmiting don't count rtt estimate
- */
-
- if (atomic_read(&sk->retransmits))
- {
- if (atomic_read(&sk->packets_out) == 0)
- atomic_set(&sk->retransmits, 0);
- }
- else
- {
- /*
- * Note that we only reset backoff and rto in the
- * rtt recomputation code. And that doesn't happen
- * if there were retransmissions in effect. So the
- * first new packet after the retransmissions is
- * sent with the backoff still in effect. Not until
- * we get an ack from a non-retransmitted packet do
- * we reset the backoff and rto. This allows us to deal
- * with a situation where the network delay has increased
- * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+ /* If we have a timestamp, we always do rtt estimates. */
+ if (tp->saw_tstamp) {
+ /* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Supersedes RFC1323)
*/
-
- if (flag & FLAG_DATA_ACKED)
- {
- tcp_rtt_estimator(tp, seq_rtt);
-
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ seq_rtt = (jiffies-tp->rcv_tsecr);
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->backoff = 0;
+ tcp_set_rto(tp);
+ } else {
+ /* Still retransmitting, use backoff */
+ tcp_set_rto(tp);
+ tp->rto = tp->rto << tp->backoff;
+ }
+ } else {
+ tcp_set_rto(tp);
+ if (flag & FLAG_DATA_ACKED)
+ (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ }
+ /* NOTE: safe here so long as cong_ctl doesn't use rto */
+ tcp_bound_rto(tp);
+ } else {
+ /* If we were retransmitting, don't count the rtt estimate. */
+ if (tp->retransmits) {
+ if (tp->packets_out == 0)
+ tp->retransmits = 0;
+ } else {
+ /* We don't have a timestamp. Can only use
+ * packets that are not retransmitted to determine
+ * rtt estimates. Also, we must not reset the
+ * backoff for rto until we get a non-retransmitted
+ * packet. This allows us to deal with a situation
+ * where the network delay has increased suddenly.
+ * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+ */
+ if (flag & FLAG_DATA_ACKED) {
+ tp->backoff = 0;
+ tcp_rtt_estimator(tp, seq_rtt);
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ }
}
}
- if (atomic_read(&sk->packets_out))
- {
- if (flag & FLAG_DATA_ACKED)
- {
+ if (tp->packets_out) {
+ if (flag & FLAG_DATA_ACKED) {
long when;
skb = skb_peek(&sk->write_queue);
when = tp->rto - (jiffies - skb->when);
- /*
- * FIXME: This assumes that when we are retransmitting
+ /* FIXME: This assumes that when we are retransmitting
* we should only ever respond with one packet.
* This means congestion windows should not grow
* during recovery. In 2.0.X we allow the congestion
@@ -791,36 +913,23 @@
* we have to fix the call to congestion window
* updates so that it works during retransmission.
*/
-
- if (atomic_read(&sk->retransmits))
- {
+ if (tp->retransmits) {
tp->retrans_head = NULL;
- /*
- * This is tricky. We are retransmiting a
+
+ /* This is tricky. We are retransmitting a
* segment of a window when congestion occurred.
*/
tcp_do_retransmit(sk, 0);
- tcp_reset_xmit_timer(sk, TIME_RETRANS,
- tp->rto);
- }
- else
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else
tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
}
- }
- else
+ } else
tcp_clear_xmit_timer(sk, TIME_RETRANS);
-
- /* FIXME: danger, if we just did a timeout and got the third
- * ack on this packet, then this is going to send it again!
- * [No. Floyd retransmit war check keeps this from happening. -- erics]
- */
tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
- /*
- * Remember the highest ack received.
- */
-
+ /* Remember the highest ack received. */
tp->snd_una = ack;
return 1;
@@ -828,11 +937,9 @@
uninteresting_ack:
SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
-
return 0;
}
-
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
@@ -846,53 +953,46 @@
* close and we go into CLOSING (and later onto TIME-WAIT)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
- *
*/
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- sk->fin_seq = skb->end_seq;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* XXX This fin_seq thing should disappear... -DaveM */
+ tp->fin_seq = skb->end_seq;
tcp_send_ack(sk);
- if (!sk->dead)
- {
+ if (!sk->dead) {
sk->state_change(sk);
sock_wake_async(sk->socket, 1);
}
- switch(sk->state)
- {
+ switch(sk->state) {
case TCP_SYN_RECV:
case TCP_SYN_SENT:
case TCP_ESTABLISHED:
- /*
- * move to CLOSE_WAIT
- */
-
+ /* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
-
if (th->rst)
sk->shutdown = SHUTDOWN_MASK;
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
- /*
- * received a retransmission of the FIN, do
+ /* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_TIME_WAIT:
- /*
- * received a retransmission of the FIN,
+ /* Received a retransmission of the FIN,
* restart the TIME_WAIT timer.
*/
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
return(0);
case TCP_FIN_WAIT1:
- /*
- * This case occurs when a simultaneous close
+ /* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*
@@ -902,47 +1002,39 @@
* FIN lost hang). The TIME_WRITE code is already
* correct for handling this timeout.
*/
-
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
- /*
- * received a FIN -- send ACK and enter TIME_WAIT
- */
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- sk->shutdown|=SHUTDOWN_MASK;
+ sk->shutdown |= SHUTDOWN_MASK;
tcp_set_state(sk,TCP_TIME_WAIT);
break;
case TCP_CLOSE:
- /*
- * already in CLOSE
- */
+ /* Already in CLOSE. */
break;
default:
+ /* FIXME: Document what's happening in this case. -DaveM */
tcp_set_state(sk,TCP_LAST_ACK);
/* Start the timers. */
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
return(0);
- }
+ };
return(0);
}
-
-
- /*
- * This one checks to see if we can put data from the
- * out_of_order queue into the receive_queue
- */
-
-static void tcp_ofo_queue(struct sock *sk)
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
{
- struct sk_buff * skb;
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ /* FIXME: out_of_order_queue is a strong tcp_opt candidate... -DaveM */
while ((skb = skb_peek(&sk->out_of_order_queue))) {
-
if (after(skb->seq, tp->rcv_nxt))
break;
@@ -950,106 +1042,59 @@
SOCK_DEBUG(sk, "ofo packet was allready received \n");
skb_unlink(skb);
kfree_skb(skb, FREE_READ);
-
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
tp->rcv_nxt, skb->seq, skb->end_seq);
skb_unlink(skb);
-
skb_queue_tail(&sk->receive_queue, skb);
-
tp->rcv_nxt = skb->end_seq;
}
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
- struct sk_buff * skb1;
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb1;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * Queue data for delivery to the user
- * Packets in sequence go to the receive queue
- * Out of sequence packets to out_of_order_queue
+ /* Queue data for delivery to the user.
+ * Packets in sequence go to the receive queue.
+ * Out of sequence packets to out_of_order_queue.
*/
-
-
if (skb->seq == tp->rcv_nxt) {
-
- /*
- * Ok. In sequence.
- */
-
-
+ /* Ok. In sequence. */
+queue_and_out:
skb_queue_tail(&sk->receive_queue, skb);
-
-
tp->rcv_nxt = skb->end_seq;
-
tcp_ofo_queue(sk);
-
if (skb_queue_len(&sk->out_of_order_queue) == 0)
tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
-
return;
}
- /*
- * Not in sequence
- * either a retransmit or some packet got lost
- */
-
+ /* Not in sequence, either a retransmit or some packet got lost. */
if (!after(skb->end_seq, tp->rcv_nxt)) {
-
- /*
- * A retransmit.
- * 2nd most common case.
- * force an imediate ack
- */
+ /* A retransmit, 2nd most common case. Force an immediate ack. */
SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
- sk->delayed_acks = MAX_DELAY_ACK;
+ tp->delayed_acks = MAX_DELAY_ACK;
kfree_skb(skb, FREE_READ);
-
return;
}
-
if (before(skb->seq, tp->rcv_nxt)) {
-
- /*
- * Partial packet
- * seq < rcv_next < end_seq
- */
+ /* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, skb->seq, skb->end_seq);
- skb_queue_tail(&sk->receive_queue, skb);
-
- tp->rcv_nxt = skb->end_seq;
-
- tcp_ofo_queue(sk);
-
- if (skb_queue_len(&sk->out_of_order_queue) == 0)
- tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
-
- return;
+ goto queue_and_out;
}
- /*
- * Ok. This is an out_of_order segment
- */
-
- /* Force an ack */
-
- sk->delayed_acks = MAX_DELAY_ACK;
-
- /*
- * disable header predition
- */
+ /* Ok. This is an out_of_order segment, force an ack. */
+ tp->delayed_acks = MAX_DELAY_ACK;
+ /* Disable header prediction. */
tp->pred_flags = 0;
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
@@ -1057,33 +1102,28 @@
if (skb_peek(&sk->out_of_order_queue) == NULL) {
skb_queue_head(&sk->out_of_order_queue,skb);
- }
- else
+ } else {
for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
-
- /* allready there */
- if (skb->seq==skb1->seq && skb->len>=skb1->len)
- {
- skb_append(skb1,skb);
+ /* Already there. */
+ if (skb->seq == skb1->seq && skb->len >= skb1->len) {
+ skb_append(skb1, skb);
skb_unlink(skb1);
- kfree_skb(skb1,FREE_READ);
+ kfree_skb(skb1, FREE_READ);
break;
}
- if (after(skb->seq, skb1->seq))
- {
+ if (after(skb->seq, skb1->seq)) {
skb_append(skb1,skb);
break;
}
- /*
- * See if we've hit the start. If so insert.
- */
+ /* See if we've hit the start. If so insert. */
if (skb1 == skb_peek(&sk->out_of_order_queue)) {
skb_queue_head(&sk->out_of_order_queue,skb);
break;
}
}
+ }
}
@@ -1096,48 +1136,33 @@
static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
{
struct tcphdr *th;
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
th = skb->h.th;
- skb_pull(skb,th->doff*4);
- skb_trim(skb,len-(th->doff*4));
+ skb_pull(skb, th->doff*4);
+ skb_trim(skb, len - (th->doff*4));
if (skb->len == 0 && !th->fin)
- {
return(0);
- }
- /*
- * FIXME: don't accept data after the receved fin
- */
-
- /*
- * The bytes in the receive read/assembly queue has increased.
- * Needed for the low memory discard algorithm
- */
-
- sk->bytes_rcv += skb->len;
-
- /*
- * We no longer have anyone receiving data on this connection.
+ /* FIXME: don't accept data after the received fin.
+ *
+ * Would checking snd_seq against fin_seq be enough?
+ * If so, how do we handle that case exactly? -DaveM
*/
+ /* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
- if (before(tp->rcv_nxt, sk->copied_seq))
- {
+ if (before(tp->rcv_nxt, sk->copied_seq)) {
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
tp->rcv_nxt = sk->copied_seq;
}
- sk->delayed_acks++;
-
- /*
- * Now tell the user we may have some data.
- */
+ tp->delayed_acks++;
- if (!sk->dead)
- {
+ /* Now tell the user we may have some data. */
+ if (!sk->dead) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
}
@@ -1149,29 +1174,25 @@
struct sk_buff *skb;
struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
- if ((skb = tp->send_head))
- {
+ if ((skb = tp->send_head)) {
if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
- atomic_read(&sk->packets_out) < tp->snd_cwnd )
- {
- /*
- * Add more data to the send queue.
- */
+ tp->packets_out < tp->snd_cwnd ) {
+ /* Add more data to the send queue. */
+
/* FIXME: the congestion window is checked
- * again in tcp_write_xmit anyway?!
+ * again in tcp_write_xmit anyway?! -- erics
+ *
+ * I think it must, it bumps tp->packets_out for
+ * each packet it fires onto the wire. -DaveM
*/
-
tcp_write_xmit(sk);
if(!sk->dead)
sk->write_space(sk);
- }
- else if (atomic_read(&sk->packets_out) == 0 && !tp->pending)
- {
- /*
- * Data to queue but no room.
- */
+ } else if (tp->packets_out == 0 && !tp->pending) {
+ /* Data to queue but no room. */
+
/* FIXME: Is it right to do a zero window probe into
- * a congestion window limited window???
+ * a congestion window limited window??? -- erics
*/
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
@@ -1180,32 +1201,25 @@
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
- /*
- * This also takes care of updating the window.
- * This if statement needs to be simplified.
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* This also takes care of updating the window.
+ * This if statement needs to be simplified.
*
- * rules for delaying an ack:
+ * Rules for delaying an ack:
* - delay time <= 0.5 HZ
* - we don't have a window update to send
* - must send at least every 2 full sized packets
*/
-
- if (sk->delayed_acks == 0)
- {
- /*
- * We sent a data segment already
- */
+ if (tp->delayed_acks == 0) {
+ /* We sent a data segment already. */
return;
}
- if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
- {
+ if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
tcp_send_ack(sk);
- }
else
- {
tcp_send_delayed_ack(sk, HZ/2);
- }
}
/*
@@ -1227,62 +1241,49 @@
ptr--;
ptr += ntohl(th->seq);
- /* ignore urgent data that we've already seen and read */
+ /* Ignore urgent data that we've already seen and read. */
if (after(sk->copied_seq, ptr))
return;
- /* do we already have a newer (or duplicate) urgent pointer? */
+ /* Do we already have a newer (or duplicate) urgent pointer? */
if (sk->urg_data && !after(ptr, sk->urg_seq))
return;
- /* tell the world about our new urgent pointer */
+ /* Tell the world about our new urgent pointer. */
if (sk->proc != 0) {
- if (sk->proc > 0) {
+ if (sk->proc > 0)
kill_proc(sk->proc, SIGURG, 1);
- } else {
+ else
kill_pg(-sk->proc, SIGURG, 1);
- }
}
- /*
- * We may be adding urgent data when the last byte read was
- * urgent. To do this requires some care. We cannot just ignore
- * sk->copied_seq since we would read the last urgent byte again
- * as data, nor can we alter copied_seq until this data arrives
- * or we break the sematics of SIOCATMARK (and thus sockatmark())
+
+ /* We may be adding urgent data when the last byte read was
+ * urgent. To do this requires some care. We cannot just ignore
+ * sk->copied_seq since we would read the last urgent byte again
+ * as data, nor can we alter copied_seq until this data arrives
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
*/
if (sk->urg_seq == sk->copied_seq)
sk->copied_seq++; /* Move the copied sequence on correctly */
sk->urg_data = URG_NOTYET;
sk->urg_seq = ptr;
- /* disable header prediction */
+ /* Disable header prediction. */
tp->pred_flags = 0;
}
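The copied_seq adjustment above preserves SIOCATMARK for readers: without it,
the byte at the old mark would be handed back again as ordinary data. A
hypothetical userland probe for the mark:

	#include <sys/ioctl.h>

	static int at_urgent_mark(int fd)	/* fd: a connected TCP socket */
	{
		int atmark = 0;
		if (ioctl(fd, SIOCATMARK, &atmark) < 0)
			return -1;
		return atmark;	/* nonzero at the urgent mark */
	}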
-/*
- * This is the 'fast' part of urgent handling.
- */
-
+/* This is the 'fast' part of urgent handling. */
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
- /*
- * Check if we get a new urgent pointer - normally not
- */
-
+ /* Check if we get a new urgent pointer - normally not. */
if (th->urg)
tcp_check_urg(sk,th);
- /*
- * Do we wait for any urgent data? - normally not
- */
-
+ /* Do we wait for any urgent data? - normally not... */
if (sk->urg_data == URG_NOTYET) {
- u32 ptr;
+ u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4);
- /*
- * Is the urgent pointer pointing into this packet?
- */
- ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
+ /* Is the urgent pointer pointing into this packet? */
if (ptr < len) {
sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
@@ -1291,26 +1292,19 @@
}
}
-
static void prune_queue(struct sock *sk)
{
struct sk_buff * skb;
- /*
- * clean the out_of_order queue
- */
-
+ /* Clean the out_of_order queue. */
while ((skb = skb_dequeue(&sk->out_of_order_queue)))
- {
kfree_skb(skb, FREE_READ);
- }
}
-
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, __u16 len)
{
- struct tcp_opt *tp;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
u32 flg;
@@ -1330,6 +1324,23 @@
*/
tp = &(sk->tp_pinfo.af_tcp);
+
+ /*
+ * RFC1323: H1. Apply PAWS check first.
+ */
+ if (tcp_fast_parse_options(th,tp)) {
+ if (tp->saw_tstamp) {
+ if (tcp_paws_discard(tp)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ kfree_skb(skb, FREE_READ);
+ return 0;
+ }
+ }
+ tcp_replace_ts_recent(tp,skb->end_seq);
+ }
+ }
+
flg = *(((u32 *)th) + 3);
/*
@@ -1340,53 +1351,39 @@
* space for instance)
*/
- if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt)
- {
- if (len <= sizeof(struct tcphdr))
- {
- if (len == sizeof(struct tcphdr))
- {
+ if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) {
+ if (len <= th->doff*4) {
+ /* Bulk data transfer: sender */
+ if (len == th->doff*4) {
tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
tcp_data_snd_check(sk);
}
kfree_skb(skb, FREE_READ);
return 0;
- }
- else if (skb->ack_seq == tp->snd_una)
- {
- /*
- * Bulk data transfer: receiver
- */
+ } else if (skb->ack_seq == tp->snd_una) {
+ /* Bulk data transfer: receiver */
- skb_pull(skb,sizeof(struct tcphdr));
+ skb_pull(skb,th->doff*4);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
- sk->bytes_rcv += len - sizeof(struct tcphdr);
-
+
sk->data_ready(sk, 0);
tcp_delack_estimator(tp);
- if (sk->delayed_acks++ == 0)
- {
+ if (tp->delayed_acks++ == 0)
tcp_send_delayed_ack(sk, HZ/2);
- }
else
- {
tcp_send_ack(sk);
- }
return 0;
}
}
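For reference, the prediction word compared here packs bytes 12-15 of the TCP
header: 0x5010 is doff = 5 (a bare 20-byte header) in the top nibble plus the
ACK flag (0x10), with the expected advertised window in the low 16 bits. A
sketch of the builder (the real assignment appears in tcp_data_queue() above):

	static __u32 make_pred_flags_sketch(__u16 snd_wnd)
	{
		return htonl((0x5010 << 16) | snd_wnd);
	}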
- if (!tcp_sequence(tp, skb->seq, skb->end_seq))
- {
- if (!th->rst)
- {
- if (after(skb->seq, tp->rcv_nxt))
- {
+ if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+ if (!th->rst) {
+ if (after(skb->seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
skb->seq, skb->end_seq,
tp->rcv_wup, tp->rcv_wnd);
@@ -1397,68 +1394,45 @@
}
}
- if(th->syn && skb->seq != sk->syn_seq)
- {
+ if(th->syn && skb->seq != sk->syn_seq) {
printk(KERN_DEBUG "syn in established state\n");
tcp_reset(sk, skb);
kfree_skb(skb, FREE_READ);
return 1;
}
- if(th->rst)
- {
+ if(th->rst) {
tcp_reset(sk,skb);
kfree_skb(skb, FREE_READ);
return 0;
}
if(th->ack)
- {
tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
- }
-
- /*
- * Process urgent data
- */
-
+ /* Process urgent data. */
tcp_urg(sk, th, len);
- /*
- * step 7: process the segment text
- */
-
-
+ /* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
- /*
- * step 8: check the FIN bit
- */
-
+ /* step 8: check the FIN bit */
if (th->fin)
- {
tcp_fin(skb, sk, th);
- }
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
- /*
- * If our receive queue has grown past its limits,
- * try to prune away duplicates etc..
+ /* If our receive queue has grown past its limits,
+ * try to prune away duplicates etc..
*/
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
prune_queue(sk);
- /*
- * And done
- */
-
if (!queued)
kfree_skb(skb, FREE_READ);
return 0;
}
-
/*
* This function implements the receiving procedure of RFC 793.
@@ -1471,49 +1445,26 @@
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
- int rcv_mss;
-
- /*
- * state == CLOSED
- * Hash lookup always fails, so no worries. -DaveM
- */
+ /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */
switch (sk->state) {
-
-
case TCP_LISTEN:
-
if (th->rst)
goto discard;
- /*
- * These use the socket TOS..
+ /* These use the socket TOS..
* might want to be the received TOS
*/
-
if(th->ack)
- {
- /*
- * send reset
- */
-
- return 1;
- }
+ return 1; /* send reset */
-
- if(th->syn)
- {
- int err;
- __u32 isn;
-
- isn = tp->af_specific->init_sequence(sk, skb);
- err = tp->af_specific->conn_request(sk, skb, opt, isn);
+ if(th->syn) {
+ __u32 isn = tp->af_specific->init_sequence(sk, skb);
- if (err < 0)
+ if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
return 1;
- /*
- * Now we have several options: In theory there is
+ /* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
@@ -1525,7 +1476,6 @@
* Now that TTCP is starting to be used we ought to
* queue this data.
*/
-
return 0;
}
@@ -1533,47 +1483,36 @@
break;
case TCP_SYN_SENT:
-
- /*
- * SYN sent means we have to look for a suitable ack and
- * either reset for bad matches or go to connected.
- * The SYN_SENT case is unusual and should
- * not be in line code. [AC]
+ /* SYN sent means we have to look for a suitable ack and
+ * either reset for bad matches or go to connected.
+ * The SYN_SENT case is unusual and should
+ * not be in line code. [AC]
*/
-
- if(th->ack)
- {
+ if(th->ack) {
tp->snd_wl1 = skb->seq;
- /* We got an ack, but it's not a good ack */
- if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len))
- {
+ /* We got an ack, but it's not a good ack. */
+ if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) {
tcp_statistics.TcpAttemptFails++;
return 1;
}
- if(th->rst)
- {
+ if(th->rst) {
tcp_reset(sk,skb);
goto discard;
}
- if(!th->syn)
- {
- /*
- * A valid ack from a different connection
- * start. Shouldn't happen but cover it
+ if(!th->syn) {
+ /* A valid ack from a different connection
+ * start. Shouldn't happen but cover it.
*/
tcp_statistics.TcpAttemptFails++;
return 1;
}
- /*
- * Ok.. it's good. Set up sequence
- * numbers and
- * move to established.
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
*/
-
tp->rcv_nxt = skb->seq+1;
tp->rcv_wnd = 0;
tp->rcv_wup = skb->seq+1;
@@ -1582,39 +1521,53 @@
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
- sk->fin_seq = skb->seq;
- tcp_send_ack(sk);
+ tp->fin_seq = skb->seq;
tcp_set_state(sk, TCP_ESTABLISHED);
- rcv_mss = tcp_parse_options(th);
-
- if (rcv_mss)
- sk->mss = min(sk->mss, rcv_mss);
+ tcp_parse_options(th,tp);
+ /* FIXME: need to make room for SACK still */
+ if (tp->tstamp_ok) {
+ tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */
+ sk->dummy_th.doff += 3; /* reserve space for options */
+ } else
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = jiffies;
+ }
+
+ /* Can't be earlier, doff would be wrong. */
+ tcp_send_ack(sk);
+
+ if (tp->in_mss)
+ sk->mss = min(sk->mss, tp->in_mss);
+
+ /* Take out space for tcp options. */
+ sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr);
sk->dummy_th.dest = th->source;
sk->copied_seq = tp->rcv_nxt;
- if(!sk->dead)
- {
+ if(!sk->dead) {
sk->state_change(sk);
sock_wake_async(sk->socket, 0);
}
/* Drop through step 6 */
goto step6;
- }
- else
- {
- if(th->syn && !th->rst)
- {
- /*
- * the previous version of the code
+ } else {
+ if(th->syn && !th->rst) {
+ /* The previous version of the code
* checked for "connecting to self"
* here. That check is done now in
- * tcp_connect
+ * tcp_connect.
*/
-
tcp_set_state(sk, TCP_SYN_RECV);
+ tcp_parse_options(th,tp);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = jiffies;
+ }
tp->rcv_nxt = skb->seq + 1;
tp->rcv_wup = skb->seq + 1;
@@ -1630,8 +1583,7 @@
break;
case TCP_TIME_WAIT:
- /*
- * RFC 1122:
+ /* RFC 1122:
* "When a connection is [...] on TIME-WAIT state [...]
* [a TCP] MAY accept a new SYN from the remote TCP to
* reopen the connection directly, if it:
@@ -1644,11 +1596,8 @@
* (2) returns to TIME-WAIT state if the SYN turns out
* to be an old duplicate".
*/
-
- if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt))
- {
+ if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) {
__u32 isn;
- int err;
skb_orphan(skb);
sk->err = ECONNRESET;
@@ -1664,50 +1613,53 @@
skb_set_owner_r(skb, sk);
tp = &sk->tp_pinfo.af_tcp;
-
- err = tp->af_specific->conn_request(sk, skb, opt, isn);
- if (err < 0)
+ if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
return 1;
-
return 0;
}
break;
-
}
- /*
- * step 1: check sequence number
- */
+ /* Parse the tcp_options present on this header.
+ * By this point we really only expect timestamps and SACKs.
+ * Note that this really has to be here and not later for PAWS
+ * (RFC1323) to work.
+ */
+ if (tcp_fast_parse_options(th,tp)) {
+ /* NOTE: assumes saw_tstamp is never set if we didn't
+ * negotiate the option. tcp_fast_parse_options() must
+ * guarantee this.
+ */
+ if (tp->saw_tstamp) {
+ if (tcp_paws_discard(tp)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+ }
+ tcp_replace_ts_recent(tp,skb->end_seq);
+ }
+ }
- if (!tcp_sequence(tp, skb->seq, skb->end_seq))
- {
- if (!th->rst)
- {
+ /* step 1: check sequence number */
+ if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+ if (!th->rst) {
tcp_send_ack(sk);
goto discard;
}
}
-
- /*
- * step 2: check RST bit
- */
-
- if(th->rst)
- {
+ /* step 2: check RST bit */
+ if(th->rst) {
tcp_reset(sk,skb);
goto discard;
}
- /*
- * step 3: check security and precedence
- * [ignored]
- */
+ /* step 3: check security and precedence [ignored] */
- /*
- * step 4:
+ /* step 4:
*
* Check for a SYN, and ensure it matches the SYN we were
* first sent. We have to handle the rather unusual (but valid)
@@ -1723,24 +1675,18 @@
* original syn.
*/
- if (th->syn && skb->seq!=sk->syn_seq)
- {
+ if (th->syn && skb->seq!=sk->syn_seq) {
tcp_reset(sk, skb);
return 1;
}
- /*
- * step 5: check the ACK field
- */
-
- if (th->ack)
- {
+ /* step 5: check the ACK field */
+ if (th->ack) {
int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len);
switch(sk->state) {
case TCP_SYN_RECV:
- if (acceptable)
- {
+ if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
sk->dummy_th.dest=th->source;
sk->copied_seq = tp->rcv_nxt;
@@ -1753,15 +1699,12 @@
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
- }
- else
+ } else
return 1;
break;
case TCP_FIN_WAIT1:
-
- if (tp->snd_una == sk->write_seq)
- {
+ if (tp->snd_una == sk->write_seq) {
sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
if (!sk->dead)
@@ -1770,17 +1713,12 @@
break;
case TCP_CLOSING:
-
if (tp->snd_una == sk->write_seq)
- {
tcp_time_wait(sk);
- }
break;
case TCP_LAST_ACK:
-
- if (tp->snd_una == sk->write_seq)
- {
+ if (tp->snd_una == sk->write_seq) {
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
if (!sk->dead)
@@ -1790,49 +1728,34 @@
break;
case TCP_TIME_WAIT:
- /*
- * keep us in TIME_WAIT until we stop getting
+ /* Keep us in TIME_WAIT until we stop getting
* packets, reset the timeout.
*/
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
break;
-
}
- }
- else
+ } else
goto discard;
- step6:
-
- /*
- * step 6: check the URG bit
- */
-
+step6:
+ /* step 6: check the URG bit */
tcp_urg(sk, th, len);
- /*
- * step 7: process the segment text
- */
-
+ /* step 7: process the segment text */
switch (sk->state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
- if (!before(skb->seq, sk->fin_seq))
+ if (!before(skb->seq, tp->fin_seq))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
-
- /*
- * RFC 793 says to queue data in this states,
- * RFC 1122 says we MUST send a reset.
- * BSD 4.4 also does reset.
+ /* RFC 793 says to queue data in these states,
+ * RFC 1122 says we MUST send a reset.
+ * BSD 4.4 also does reset.
*/
-
- if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead)
- {
- if (after(skb->end_seq - th->fin, tp->rcv_nxt))
- {
+ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
+ if (after(skb->end_seq - th->fin, tp->rcv_nxt)) {
tcp_reset(sk, skb);
return 1;
}
@@ -1843,22 +1766,16 @@
break;
}
- /*
- * step 8: check the FIN bit
- */
-
+ /* step 8: check the FIN bit */
if (th->fin)
- {
tcp_fin(skb, sk, th);
- }
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
if (queued)
return 0;
- discard:
-
+discard:
kfree_skb(skb, FREE_READ);
return 0;
}
@@ -1871,19 +1788,18 @@
retv = proc_dointvec(ctl, write, filp, buffer, lenp);
- if (write)
- {
+ if (write) {
switch (sysctl_tcp_cong_avoidance) {
- case 0:
- tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
- break;
- case 1:
- tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas;
- break;
- default:
- retv = -EINVAL;
- sysctl_tcp_cong_avoidance = val;
- }
+ case 0:
+ tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
+ break;
+ case 1:
+ tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas;
+ break;
+ default:
+ retv = -EINVAL;
+ sysctl_tcp_cong_avoidance = val;
+ };
}
return retv;