patch-2.1.28 linux/net/ipv4/tcp_ipv4.c
- Lines: 637
- Date: Mon Mar 3 09:37:44 1997
- Orig file: v2.1.27/linux/net/ipv4/tcp_ipv4.c
- Orig date: Tue Feb 4 06:44:25 1997
diff -u --recursive --new-file v2.1.27/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -22,6 +22,12 @@
* 2 of the License, or (at your option) any later version.
*/
+/*
+ * Changes:
+ * David S. Miller : New socket lookup architecture.
+ * This code is dedicated to John Dyson.
+ */
+
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
@@ -38,44 +44,360 @@
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
-/*
- * Cached last hit socket
+/* This is for sockets with full identity only. Sockets here will always
+ * be without wildcards and will have the following invariant:
+ * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
*/
-
-static volatile unsigned long th_cache_saddr, th_cache_daddr;
-static volatile unsigned short th_cache_dport, th_cache_sport;
-static volatile struct sock *th_cache_sk;
+struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+
+/* All sockets in TCP_LISTEN state will be in here. This is the only table
+ * where wildcard'd TCP sockets can exist. Hash function here is just local
+ * port number. XXX Fix or we'll lose with thousands of IP aliases...
+ */
+struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
-void tcp_cache_zap(void)
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
+static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+ __u32 faddr, __u16 fport)
{
- th_cache_sk=NULL;
+ return ((laddr ^ lport) ^ (faddr ^ fport)) & (TCP_HTABLE_SIZE - 1);
}
-/*
- * Find the socket, using the last hit cache if applicable.
- * The cache is not quite right...
+static __inline__ int tcp_sk_hashfn(struct sock *sk)
+{
+ __u32 laddr = sk->rcv_saddr;
+ __u16 lport = sk->num;
+ __u32 faddr = sk->daddr;
+ __u16 fport = sk->dummy_th.dest;
+
+ return tcp_hashfn(laddr, lport, faddr, fport);
+}
+
+static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
+{
+ struct sock *sk2;
+ int retval = 0, sk_reuse = sk->reuse;
+
+ SOCKHASH_LOCK();
+ sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; sk2 != NULL; sk2 = sk2->prev) {
+ if((sk2->num == snum) && (sk2 != sk)) {
+ unsigned char state = sk2->state;
+ int sk2_reuse = sk2->reuse;
+
+ if(!sk2->rcv_saddr || !sk->rcv_saddr) {
+ if((!sk2_reuse) ||
+ (!sk_reuse) ||
+ (state != TCP_LISTEN)) {
+ retval = 1;
+ break;
+ }
+ } else if(sk2->rcv_saddr == sk->rcv_saddr) {
+ if((!sk_reuse) ||
+ (!sk2_reuse) ||
+ (state == TCP_LISTEN)) {
+ retval = 1;
+ break;
+ }
+ }
+ }
+ }
+ SOCKHASH_UNLOCK();
+
+ return retval;
+}
+
+static __inline__ int tcp_lport_inuse(int num)
+{
+ struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
+
+ for(; sk != NULL; sk = sk->prev) {
+ if(sk->num == num)
+ return 1;
+ }
+ return 0;
+}
+
+/* Find a "good" local port, this is family independant.
+ * There are several strategies working in unison here to
+ * get the best possible performance. The current socket
+ * load is kept track of, if it is zero there is a strong
+ * likely hood that there is a zero length chain we will
+ * find with a small amount of searching, else the load is
+ * what we shoot for for when the chains all have at least
+ * one entry. The base helps us walk the chains in an
+ * order such that a good chain is found as quickly as possible. -DaveM
*/
+unsigned short tcp_good_socknum(void)
+{
+ static int start = PROT_SOCK;
+ static int binding_contour = 0;
+ int best = 0;
+ int size = 32767; /* a big num. */
+ int retval = 0, i, end, bc;
+
+ SOCKHASH_LOCK();
+ i = tcp_bhashfn(start);
+ end = i + TCP_BHTABLE_SIZE;
+ bc = binding_contour;
+ do {
+ struct sock *sk = tcp_bound_hash[tcp_bhashfn(i)];
+ if(!sk) {
+ retval = (start + i);
+ start = (retval + 1);
+
+ /* Check for decreasing load. */
+ if(bc != 0)
+ binding_contour = 0;
+ goto done;
+ } else {
+ int j = 0;
+ do { sk = sk->prev; } while(++j < size && sk);
+ if(j < size) {
+ best = (start + i);
+ size = j;
+ if(bc && size <= bc) {
+ start = best + 1;
+ goto verify;
+ }
+ }
+ }
+ } while(++i != end);
-static inline struct sock * get_tcp_sock(u32 saddr, u16 sport,
- u32 daddr, u16 dport)
+ /* Socket load is increasing, adjust our load average. */
+ binding_contour = size;
+verify:
+ if(size < binding_contour)
+ binding_contour = size;
+
+ if(best > 32767)
+ best -= (32768 - PROT_SOCK);
+
+ while(tcp_lport_inuse(best))
+ best += TCP_BHTABLE_SIZE;
+ retval = best;
+done:
+ if(start > 32767)
+ start -= (32768 - PROT_SOCK);
+
+ SOCKHASH_UNLOCK();
+
+ return retval;
+}
+
+static void tcp_v4_hash(struct sock *sk)
{
- struct sock * sk;
+ unsigned char state;
+
+ SOCKHASH_LOCK();
+ state = sk->state;
+ if(state != TCP_CLOSE || !sk->dead) {
+ struct sock **htable;
+
+ if(state == TCP_LISTEN) {
+ sk->hashent = tcp_sk_listen_hashfn(sk);
+ htable = &tcp_listening_hash[0];
+ } else {
+ sk->hashent = tcp_sk_hashfn(sk);
+ htable = &tcp_established_hash[0];
+ }
+ sk->next = htable[sk->hashent];
+ htable[sk->hashent] = sk;
+ sk->hashtable = htable;
+ tcp_sk_bindify(sk);
+ }
+ SOCKHASH_UNLOCK();
+}
+
+static void tcp_v4_unhash(struct sock *sk)
+{
+ struct sock **htable;
+
+ SOCKHASH_LOCK();
+ htable = sk->hashtable;
+ if(htable) {
+ struct sock **skp = &(htable[sk->hashent]);
+ while(*skp != NULL) {
+ if(*skp == sk) {
+ *skp = sk->next;
+ break;
+ }
+ skp = &((*skp)->next);
+ }
+ sk->hashtable = NULL;
+ }
+ if(sk->state == TCP_CLOSE && sk->dead)
+ tcp_sk_unbindify(sk);
+ SOCKHASH_UNLOCK();
+}
+
+static void tcp_v4_rehash(struct sock *sk)
+{
+ struct sock **htable;
+ unsigned char state;
+
+ SOCKHASH_LOCK();
+ htable = &(sk->hashtable[sk->hashent]);
+ state = sk->state;
+ if(htable) {
+ while(*htable != NULL) {
+ if(*htable == sk) {
+ *htable = sk->next;
+ break;
+ }
+ htable = &((*htable)->next);
+ }
+ }
+ tcp_sk_unbindify(sk);
+ htable = NULL;
+ if(state != TCP_CLOSE || !sk->dead) {
+ if(state == TCP_LISTEN) {
+ sk->hashent = tcp_sk_listen_hashfn(sk);
+ htable = &tcp_listening_hash[0];
+ } else {
+ sk->hashent = tcp_sk_hashfn(sk);
+ htable = &tcp_established_hash[0];
+ }
+ sk->next = htable[sk->hashent];
+ htable[sk->hashent] = sk;
+ tcp_sk_bindify(sk);
+ }
+ sk->hashtable = htable;
+ SOCKHASH_UNLOCK();
+}
- sk = (struct sock *) th_cache_sk;
- if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
- sport != th_cache_sport || dport != th_cache_dport) {
- sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
- if (sk) {
- th_cache_saddr=saddr;
- th_cache_daddr=daddr;
- th_cache_dport=dport;
- th_cache_sport=sport;
- th_cache_sk=sk;
+/* Don't inline this cruft. Here are some nice properties to
+ * exploit here. The BSD API does not allow a listening TCP
+ * to specify the remote port nor the remote address for the
+ * connection. So always assume those are both wildcarded
+ * during the search since they can never be otherwise.
+ *
+ * XXX Later on, hash on both local port _and_ local address,
+ * XXX to handle a huge IP alias'd box. Keep in mind that
+ * XXX such a scheme will require us to run through the listener
+ * XXX hash twice, once for local addresses bound, and once for
+ * XXX the local address wildcarded (because the hash is different).
+ */
+static struct sock *tcp_v4_lookup_longway(u32 daddr, unsigned short hnum)
+{
+ struct sock *sk = tcp_listening_hash[tcp_lhashfn(hnum)];
+ struct sock *result = NULL;
+
+ for(; sk; sk = sk->next) {
+ if(sk->num == hnum) {
+ __u32 rcv_saddr = sk->rcv_saddr;
+
+ if(rcv_saddr) {
+ if(rcv_saddr == daddr)
+ return sk; /* Best possible match. */
+ } else if(!result)
+ result = sk;
}
}
+ return result;
+}
+
+/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ */
+static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
+ u32 saddr, u16 sport, u32 daddr, u16 dport)
+{
+ unsigned short hnum = ntohs(dport);
+ struct sock *sk;
+
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways. It is assumed that this code only
+ * gets called from within NET_BH.
+ */
+ sk = tcp_established_hash[tcp_hashfn(daddr, hnum, saddr, sport)];
+ for(; sk; sk = sk->next)
+ if(sk->daddr == saddr && /* remote address */
+ sk->dummy_th.dest == sport && /* remote port */
+ sk->num == hnum && /* local port */
+ sk->rcv_saddr == daddr) /* local address */
+ goto hit; /* You sunk my battleship! */
+ sk = tcp_v4_lookup_longway(daddr, hnum);
+hit:
return sk;
}
+__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport)
+{
+ return __tcp_v4_lookup(0, saddr, sport, daddr, dport);
+}
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+#define secondlist(hpnum, sk, fpass) \
+({ struct sock *s1; if(!(sk) && (fpass)--) \
+ s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
+ else \
+ s1 = (sk); \
+ s1; \
+})
+
+#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
+ secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
+
+#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
+ secondlist((hpnum),(sk)->next,(fpass))
+
+struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
+ unsigned short rnum, unsigned long laddr,
+ unsigned long paddr, unsigned short pnum)
+{
+ struct sock *s, *result = NULL;
+ int badness = -1;
+ unsigned short hnum = ntohs(num);
+ unsigned short hpnum = ntohs(pnum);
+ int firstpass = 1;
+
+ /* This code must run only from NET_BH. */
+ for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
+ s != NULL;
+ s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
+ if(s->num == hnum || s->num == hpnum) {
+ int score = 0;
+ if(s->dead && (s->state == TCP_CLOSE))
+ continue;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
+ continue;
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dummy_th.dest) {
+ if(s->dummy_th.dest != rnum)
+ continue;
+ score++;
+ }
+ if(score == 3 && s->num == hnum) {
+ result = s;
+ break;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
+ }
+ }
+ }
+ return result;
+}
+
+#undef secondlist
+#undef tcp_v4_proxy_loop_init
+#undef tcp_v4_proxy_loop_next
+
+#endif
+
static __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
return secure_tcp_sequence_number(sk->saddr, sk->daddr,
@@ -94,27 +416,24 @@
static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
{
- int retval = 1;
+ int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
struct sock * sk;
- /* Make sure we are allowed to connect here. */
- cli();
- for (sk = tcp_prot.sock_array[snum & (SOCK_ARRAY_SIZE -1)];
- sk != NULL; sk = sk->next)
- {
- /* hash collision? */
- if (sk->num != snum)
- continue;
- if (sk->saddr != saddr)
- continue;
- if (sk->daddr != daddr)
- continue;
- if (sk->dummy_th.dest != dnum)
- continue;
- retval = 0;
- break;
+ /* Make sure we are allowed to connect here.
+ * But freeze the hash while we snoop around.
+ */
+ SOCKHASH_LOCK();
+ sk = tcp_established_hash[hashent];
+ for (; sk != NULL; sk = sk->next) {
+ if(sk->daddr == daddr && /* remote address */
+ sk->dummy_th.dest == dnum && /* remote port */
+ sk->num == snum && /* local port */
+ sk->saddr == saddr) { /* local address */
+ retval = 0;
+ break;
+ }
}
- sti();
+ SOCKHASH_UNLOCK();
return retval;
}
@@ -184,6 +503,7 @@
sk->priority = rt->u.dst.priority;
sk->dummy_th.dest = usin->sin_port;
+
sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
sk->dummy_th.source,
usin->sin_port);
@@ -262,14 +582,13 @@
buff->csum = csum_partial(ptr, 4, 0);
tcp_v4_send_check(sk, t1, sizeof(struct tcphdr) + 4, buff);
- /*
- * This must go first otherwise a really quick response
- * will get reset.
- */
-
- tcp_cache_zap();
tcp_set_state(sk,TCP_SYN_SENT);
+ /* Socket identity change complete, no longer
+ * in TCP_CLOSE, so rehash.
+ */
+ tcp_v4_rehash(sk);
+
tp->rto = rt->u.dst.rtt;
tcp_init_xmit_timers(sk);
@@ -348,7 +667,7 @@
int code = skb->h.icmph->code;
struct sock *sk;
- sk = get_sock(&tcp_prot, th->source, iph->daddr, th->dest, iph->saddr);
+ sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest);
if (sk == NULL)
return;
@@ -402,39 +721,13 @@
}
}
-/*
- * This routine computes a TCP checksum.
- *
- * Modified January 1995 from a go-faster DOS routine by
- * Jorge Cwik <jorge@laser.satlink.net>
- */
+/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb)
{
- __u32 saddr = sk->saddr;
- __u32 daddr = sk->daddr;
-#ifdef DEBUG_TCP_CHECK
- u16 check;
-#endif
th->check = 0;
- th->check = tcp_v4_check(th, len, saddr, daddr,
- csum_partial((char *)th, sizeof(*th),
- skb->csum));
-
-#ifdef DEBUG_TCP_CHECK
- check = th->check;
- th->check = 0;
- th->check = tcp_v4_check(th, len, saddr, daddr,
- csum_partial((char *)th,len,0));
- if (check != th->check) {
- static int count = 0;
- if (++count < 10) {
- printk("Checksum %x (%x) from %p\n", th->check, check,
- __builtin_return_address(0));
- printk("TCP=<off:%d a:%d s:%d f:%d> len=%d\n", th->doff*4, th->ack, th->syn, th->fin, len);
- }
- }
-#endif
+ th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
+ csum_partial((char *)th, sizeof(*th), skb->csum));
}
/*
@@ -504,7 +797,7 @@
struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
struct sock *sk;
- sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr);
+ sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest);
if (!sk)
return 0;
@@ -717,6 +1010,10 @@
}
memcpy(newsk, sk, sizeof(*newsk));
+
+ /* Or else we die! -DaveM */
+ newsk->sklist_next = NULL;
+
newsk->opt = NULL;
newsk->dst_cache = NULL;
skb_queue_head_init(&newsk->write_queue);
@@ -833,10 +1130,8 @@
newsk->mss = min(req->mss, snd_mss);
- inet_put_sock(newsk->num, newsk);
-
- tcp_cache_zap();
-
+ tcp_v4_hash(newsk);
+ add_to_prot_sklist(newsk);
return newsk;
}
@@ -1009,11 +1304,11 @@
#ifdef CONFIG_IP_TRANSPARENT_PROXY
if (IPCB(skb)->redirport)
- sk = get_sock_proxy(&tcp_prot, th->dest, saddr, th->source, daddr, skb->dev->pa_addr, IPCB(skb)->redirport);
+ sk = tcp_v4_proxy_lookup(th->dest, saddr, th->source, daddr,
+ skb->dev->pa_addr, IPCB(skb)->redirport);
else
#endif
- sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
-
+ sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1084,9 +1379,8 @@
static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
{
- struct sock *sk;
- sk = get_tcp_sock(skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, th->dest);
- return sk;
+ return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, th->dest);
}
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
@@ -1197,26 +1491,33 @@
}
struct proto tcp_prot = {
- tcp_close,
- tcp_v4_connect,
- tcp_accept,
- NULL,
- tcp_write_wakeup,
- tcp_read_wakeup,
- tcp_poll,
- tcp_ioctl,
- tcp_v4_init_sock,
- tcp_v4_destroy_sock,
- tcp_shutdown,
- tcp_setsockopt,
- tcp_getsockopt,
- tcp_v4_sendmsg,
- tcp_recvmsg,
- NULL, /* No special bind() */
- tcp_v4_backlog_rcv,
- 128,
- 0,
- "TCP",
- 0, 0,
- NULL
+ (struct sock *)&tcp_prot, /* sklist_next */
+ (struct sock *)&tcp_prot, /* sklist_prev */
+ tcp_close, /* close */
+ tcp_v4_connect, /* connect */
+ tcp_accept, /* accept */
+ NULL, /* retransmit */
+ tcp_write_wakeup, /* write_wakeup */
+ tcp_read_wakeup, /* read_wakeup */
+ tcp_poll, /* poll */
+ tcp_ioctl, /* ioctl */
+ tcp_v4_init_sock, /* init */
+ tcp_v4_destroy_sock, /* destroy */
+ tcp_shutdown, /* shutdown */
+ tcp_setsockopt, /* setsockopt */
+ tcp_getsockopt, /* getsockopt */
+ tcp_v4_sendmsg, /* sendmsg */
+ tcp_recvmsg, /* recvmsg */
+ NULL, /* bind */
+ tcp_v4_backlog_rcv, /* backlog_rcv */
+ tcp_v4_hash, /* hash */
+ tcp_v4_unhash, /* unhash */
+ tcp_v4_rehash, /* rehash */
+ tcp_good_socknum, /* good_socknum */
+ tcp_v4_verify_bind, /* verify_bind */
+ 128, /* max_header */
+ 0, /* retransmits */
+ "TCP", /* name */
+ 0, /* inuse */
+ 0 /* highestinuse */
};
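
For readers skimming the patch, the heart of the new lookup architecture is the XOR hash in tcp_hashfn() above: the local/foreign address and port 4-tuple is folded into a bucket index of tcp_established_hash, so a fully-identified connection is found by walking a single short chain. Below is a minimal standalone sketch of that idea; it is not part of the patch, and the table size and sample addresses are invented purely for illustration.

#include <stdio.h>

#define DEMO_HTABLE_SIZE 256	/* stand-in for TCP_HTABLE_SIZE; must be a power of two */

typedef unsigned int   u32;
typedef unsigned short u16;

/* XOR-fold the connection 4-tuple into a bucket index, in the spirit of
 * tcp_hashfn() above.  Masking works because the size is a power of two. */
static int demo_hashfn(u32 laddr, u16 lport, u32 faddr, u16 fport)
{
	return ((laddr ^ lport) ^ (faddr ^ fport)) & (DEMO_HTABLE_SIZE - 1);
}

int main(void)
{
	/* Two hypothetical connections that differ only in the remote port
	 * fall into different buckets, keeping each hash chain short. */
	u32 local = 0xc0a80001, remote = 0xc0a80002;	/* 192.168.0.1 and .2 */

	printf("bucket for remote port 1025: %d\n", demo_hashfn(local, 80, remote, 1025));
	printf("bucket for remote port 1026: %d\n", demo_hashfn(local, 80, remote, 1026));
	return 0;
}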