1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp_input.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * FIXES
23 * Pedro Roque : Double ACK bug
24 */
25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
29 #include <linux/interrupt.h>
30
31 /*
32 * Policy code extracted so it's now separate
33 */
34
35 /*
36 * Called each time to estimate the delayed ack timeout. This is
37 * how it should be done so a fast link isn't impacted by ack delay.
38 */
39
40 extern __inline__ void tcp_delack_estimator(struct sock *sk)
41 {
42 /*
43 * Delayed ACK time estimator.
44 */
45
46 if (sk->lrcvtime == 0)
47 {
48 sk->lrcvtime = jiffies;
49 sk->ato = HZ/3;
50 }
51 else
52 {
53 int m;
54
55 m = jiffies - sk->lrcvtime;
56
57 sk->lrcvtime = jiffies;
58
59 if (m <= 0)
60 m = 1;
61
62 if (m > (sk->rtt >> 3))
63 {
64 sk->ato = sk->rtt >> 3;
65 /*
66 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
67 */
68 }
69 else
70 {
71 sk->ato = (sk->ato >> 1) + m;
72 /*
73 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
74 */
75 }
76 }
77 }
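/*
 *	Worked example (assuming HZ=100 purely for illustration): the
 *	first segment seeds ato at HZ/3 = 33 jiffies. If the next one
 *	arrives m=10 jiffies later and 10 <= rtt>>3, we take the else
 *	branch above and ato becomes (33>>1) + 10 = 26, i.e. ato roughly
 *	tracks the observed inter-arrival time, clamped above by rtt>>3.
 */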
78
79 /*
80 * Called on frames that were known _not_ to have been
81 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
82 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
83 */
84
85 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
86 {
87 long m;
88 /*
89 * The following amusing code comes from Jacobson's
90 * article in SIGCOMM '88. Note that rtt and mdev
91 * are scaled versions of rtt and mean deviation.
92 * This is designed to be as fast as possible
93 * m stands for "measurement".
94 */
95
96 m = jiffies - oskb->when; /* RTT */
97 if(m<=0)
98 m=1; /* IS THIS RIGHT FOR <0 ??? */
99 m -= (sk->rtt >> 3); /* m is now error in rtt est */
100 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
101 if (m < 0)
102 m = -m; /* m is now abs(error) */
103 m -= (sk->mdev >> 2); /* similar update on mdev */
104 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
105
106 /*
107 * Now update timeout. Note that this removes any backoff.
108 */
109
110 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
111 if (sk->rto > 120*HZ)
112 sk->rto = 120*HZ;
113 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
114 sk->rto = HZ/5;
115 sk->backoff = 0;
116 }
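/*
 *	For reference: with the scaling above sk->rtt effectively holds
 *	8*SRTT and sk->mdev holds 4*MDEV, so the timeout computed above is
 *	rto = ((8*SRTT)>>2 + 4*MDEV)>>1 = SRTT + 2*MDEV, clamped to the
 *	range [HZ/5, 120*HZ].
 */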
117
118 /*
119 * Cached last hit socket
120 */
121
122 static volatile unsigned long th_cache_saddr, th_cache_daddr;
123 static volatile unsigned short th_cache_dport, th_cache_sport;
124 static volatile struct sock *th_cache_sk;
125
126 void tcp_cache_zap(void)
127 {
128 th_cache_sk=NULL;
129 }
130
131 /*
132 * Find the socket, using the last hit cache if applicable. The cache is not quite
133 * right...
134 */
135
136 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
137 {
138 struct sock * sk;
139
140 sk = (struct sock *) th_cache_sk;
141 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
142 sport != th_cache_sport || dport != th_cache_dport) {
143 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
144 if (sk) {
145 th_cache_saddr=saddr;
146 th_cache_daddr=daddr;
147 th_cache_dport=dport;
148 th_cache_sport=sport;
149 th_cache_sk=sk;
150 }
151 }
152 return sk;
153 }
154
155 /*
156 * React to an out-of-window TCP sequence number in an incoming packet
157 */
158
159 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
160 struct options *opt, unsigned long saddr, struct device *dev)
161 {
162 if (th->rst)
163 return;
164
165 /*
166 * Send a reset if we get something not ours and we are
167 * unsynchronized. Note: We don't do anything to our end. We
168 * are just killing the bogus remote connection then we will
169 * connect again and it will work (with luck).
170 */
171
172 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
173 {
174 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
175 return;
176 }
177
178 /*
179 * 4.3reno machines look for this kind of ack so they can do fast
180 * recovery. Three identical 'old' acks let it know that one frame has
181 * been lost and should be resent. Because this happens before the whole window
182 * of data has timed out, it can recover from one lost frame per window without
183 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
184 *
185 * We also should be spotting triple bad sequences.
186 */
187 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
188 return;
189 }
190
191 /*
192 * This function checks to see if the tcp segment's sequence numbers are actually acceptable.
193 */
194
195 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
196 {
197 u32 end_window = sk->acked_seq + sk->window;
198 return /* if start is at end of window, end must be too (zero window) */
199 (seq == end_window && seq == end_seq) ||
200 /* if start is before end of window, check for interest */
201 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
202 }
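/*
 *	Example: with acked_seq = 1000 and window = 500, end_window is 1500.
 *	A segment with seq=900, end_seq=1100 (partly old, partly new data)
 *	is accepted, one starting at or beyond 1500 is not, and with a zero
 *	window only the degenerate seq == end_seq == end_window probe passes.
 */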
203
204 /*
205 * When we get a reset we do this. This probably is a tcp_output routine
206 * really.
207 */
208
209 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
210 {
211 sk->zapped = 1;
212 /*
213 * We want the right error as BSD sees it (and indeed as we do).
214 */
215 sk->err = ECONNRESET;
216 if (sk->state == TCP_SYN_SENT)
217 sk->err = ECONNREFUSED;
218 if (sk->state == TCP_CLOSE_WAIT)
219 sk->err = EPIPE;
220 #ifdef CONFIG_TCP_RFC1337
221 /*
222 * Time wait assassination protection [RFC1337]
223 *
224 * This is a good idea, but causes more sockets to take time to close.
225 *
226 * Ian Heavens has since shown this is an inadequate fix for the protocol
227 * bug in question.
228 */
229 if(sk->state!=TCP_TIME_WAIT)
230 {
231 tcp_set_state(sk,TCP_CLOSE);
232 sk->shutdown = SHUTDOWN_MASK;
233 }
234 #else
235 tcp_set_state(sk,TCP_CLOSE);
236 sk->shutdown = SHUTDOWN_MASK;
237 #endif
238 if (!sk->dead)
239 sk->state_change(sk);
240 kfree_skb(skb, FREE_READ);
241 return(0);
242 }
243
244
245 /*
246 * Look for tcp options. Parses everything but only knows about MSS.
247 * This routine is always called with the packet containing the SYN.
248 * However it may also be called with the ack to the SYN. So you
249 * can't assume this is always the SYN. It's always called after
250 * we have set up sk->mtu to our own MTU.
251 *
252 * We need at minimum to add PAWS support here. Possibly large windows
253 * as Linux gets deployed on 100Mb/sec networks.
254 */
255
256 static void tcp_options(struct sock *sk, struct tcphdr *th)
257 {
258 unsigned char *ptr;
259 int length=(th->doff*4)-sizeof(struct tcphdr);
260 int mss_seen = 0;
261
262 ptr = (unsigned char *)(th + 1);
263
264 while(length>0)
265 {
266 int opcode=*ptr++;
267 int opsize=*ptr++;
268 switch(opcode)
269 {
270 case TCPOPT_EOL:
271 return;
272 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
273 length--;
274 ptr--; /* the opsize=*ptr++ above was a mistake */
275 continue;
276
277 default:
278 if(opsize<=2) /* Avoid silly options looping forever */
279 return;
280 switch(opcode)
281 {
282 case TCPOPT_MSS:
283 if(opsize==4 && th->syn)
284 {
285 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
286 mss_seen = 1;
287 }
288 break;
289 /* Add other options here as people feel the urge to implement stuff like large windows */
290 }
291 ptr+=opsize-2;
292 length-=opsize;
293 }
294 }
295 if (th->syn)
296 {
297 if (! mss_seen)
298 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
299 }
300 #ifdef CONFIG_INET_PCTCP
301 sk->mss = min(sk->max_window >> 1, sk->mtu);
302 #else
303 sk->mss = min(sk->max_window, sk->mtu);
304 sk->max_unacked = 2 * sk->mss;
305 #endif
306 }
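/*
 *	For reference, the only option actually interpreted above is MSS,
 *	which on the wire is kind=2, length=4 followed by a 16 bit MSS in
 *	network byte order (RFC 793), e.g. 02 04 05 b4 for an MSS of 1460.
 */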
307
308
309 /*
310 * This routine handles a connection request.
311 * It should make sure we haven't already responded.
312 * Because of the way BSD works, we have to send a syn/ack now.
313 * This also means it will be harder to close a socket which is
314 * listening.
315 */
316
317 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
318 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
319 {
320 struct sock *newsk;
321 struct tcphdr *th;
322 struct rtable *rt;
323
324 th = skb->h.th;
325
326 /* If the socket is dead, don't accept the connection. */
327 if (!sk->dead)
328 {
329 sk->data_ready(sk,0);
330 }
331 else
332 {
333 if(sk->debug)
334 printk("Reset on %p: Connect on dead socket.\n",sk);
335 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
336 tcp_statistics.TcpAttemptFails++;
337 kfree_skb(skb, FREE_READ);
338 return;
339 }
340
341 /*
342 * Make sure we can accept more. This will prevent a
343 * flurry of syns from eating up all our memory.
344 *
345 * BSD does some funnies here and allows 3/2 times the
346 * set backlog as a fudge factor. That's just too gross.
347 */
348
349 if (sk->ack_backlog >= sk->max_ack_backlog)
350 {
351 tcp_statistics.TcpAttemptFails++;
352 kfree_skb(skb, FREE_READ);
353 return;
354 }
355
356 /*
357 * We need to build a new sock struct.
358 * It is sort of bad to have a socket without an inode attached
359 * to it, but the wake_up's will just wake up the listening socket,
360 * and if the listening socket is destroyed before this is taken
361 * off of the queue, this will take care of it.
362 */
363
364 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
365 if (newsk == NULL)
366 {
367 /* just ignore the syn. It will get retransmitted. */
368 tcp_statistics.TcpAttemptFails++;
369 kfree_skb(skb, FREE_READ);
370 return;
371 }
372
373 memcpy(newsk, sk, sizeof(*newsk));
374 newsk->opt = NULL;
375 newsk->ip_route_cache = NULL;
376 if (opt && opt->optlen)
377 {
378 newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
379 if (!newsk->opt)
380 {
381 kfree_s(newsk, sizeof(struct sock));
382 tcp_statistics.TcpAttemptFails++;
383 kfree_skb(skb, FREE_READ);
384 return;
385 }
386 if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb))
387 {
388 kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
389 kfree_s(newsk, sizeof(struct sock));
390 tcp_statistics.TcpAttemptFails++;
391 kfree_skb(skb, FREE_READ);
392 return;
393 }
394 }
395 skb_queue_head_init(&newsk->write_queue);
396 skb_queue_head_init(&newsk->receive_queue);
397 newsk->send_head = NULL;
398 newsk->send_tail = NULL;
399 skb_queue_head_init(&newsk->back_log);
400 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
401 newsk->ato = HZ/3;
402 newsk->rto = TCP_TIMEOUT_INIT;
403 newsk->mdev = 0;
404 newsk->max_window = 0;
405 newsk->cong_window = 1;
406 newsk->cong_count = 0;
407 newsk->ssthresh = 0;
408 newsk->backoff = 0;
409 newsk->blog = 0;
410 newsk->intr = 0;
411 newsk->proc = 0;
412 newsk->done = 0;
413 newsk->partial = NULL;
414 newsk->pair = NULL;
415 newsk->wmem_alloc = 0;
416 newsk->rmem_alloc = 0;
417 newsk->localroute = sk->localroute;
418
419 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
420
421 newsk->err = 0;
422 newsk->shutdown = 0;
423 newsk->ack_backlog = 0;
424 newsk->acked_seq = skb->seq+1;
425 newsk->lastwin_seq = skb->seq+1;
426 newsk->delay_acks = 1;
427 newsk->copied_seq = skb->seq+1;
428 newsk->fin_seq = skb->seq;
429 newsk->state = TCP_SYN_RECV;
430 newsk->timeout = 0;
431 newsk->ip_xmit_timeout = 0;
432 newsk->write_seq = seq;
433 newsk->window_seq = newsk->write_seq;
434 newsk->rcv_ack_seq = newsk->write_seq;
435 newsk->urg_data = 0;
436 newsk->retransmits = 0;
437 newsk->linger=0;
438 newsk->destroy = 0;
439 init_timer(&newsk->timer);
440 newsk->timer.data = (unsigned long)newsk;
441 newsk->timer.function = &net_timer;
442 init_timer(&newsk->retransmit_timer);
443 newsk->retransmit_timer.data = (unsigned long)newsk;
444 newsk->retransmit_timer.function=&tcp_retransmit_timer;
445 newsk->dummy_th.source = skb->h.th->dest;
446 newsk->dummy_th.dest = skb->h.th->source;
447
448 /*
449 * Swap these two, they are from our point of view.
450 */
451
452 newsk->daddr = saddr;
453 newsk->saddr = daddr;
454 newsk->rcv_saddr = daddr;
455
456 put_sock(newsk->num,newsk);
457 newsk->acked_seq = skb->seq + 1;
458 newsk->copied_seq = skb->seq + 1;
459 newsk->socket = NULL;
460
461 /*
462 * Grab the ttl and tos values and use them
463 */
464
465 newsk->ip_ttl=sk->ip_ttl;
466 newsk->ip_tos=skb->ip_hdr->tos;
467
468 /*
469 * Use 512 or whatever user asked for
470 */
471
472 /*
473 * Note use of sk->user_mss, since user has no direct access to newsk
474 */
475
476 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
477 newsk->ip_route_cache = rt;
478
479 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
480 newsk->window_clamp = rt->rt_window;
481 else
482 newsk->window_clamp = 0;
483
484 if (sk->user_mss)
485 newsk->mtu = sk->user_mss;
486 else if (rt)
487 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
488 else
489 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
490
491 /*
492 * But not bigger than device MTU
493 */
494
495 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
496
497 #ifdef CONFIG_SKIP
498
499 /*
500 * SKIP devices set their MTU to 65535. This is so they can pass packets
501 * unfragmented to the security process and then fragment. They could lie to the
502 * TCP layer about a suitable MTU, but it's easier to let SKIP sort it out
503 * simply because the final packet we want unfragmented is going to be
504 *
505 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
506 */
507 
508 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
509 newsk->mtu=skip_pick_mtu(newsk->mtu,dev);
510 #endif
511 /*
512 * This will min with what arrived in the packet
513 */
514
515 tcp_options(newsk,skb->h.th);
516
517 tcp_cache_zap();
518 tcp_send_synack(newsk, sk, skb);
519 }
520
521
522 /*
523 * Handle a TCP window that shrunk on us. It shouldn't happen,
524 * but..
525 *
526 * We may need to move packets from the send queue
527 * to the write queue, if the window has been shrunk on us.
528 * The RFC says you are not allowed to shrink your window
529 * like this, but if the other end does, you must be able
530 * to deal with it.
531 */
532 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
533 {
534 struct sk_buff *skb;
535 struct sk_buff *skb2;
536 struct sk_buff *wskb = NULL;
537
538 skb2 = sk->send_head;
539 sk->send_head = NULL;
540 sk->send_tail = NULL;
541
542 /*
543 * This is an artifact of a flawed concept. We want one
544 * queue and a smarter send routine when we send all.
545 */
546 cli();
547 while (skb2 != NULL)
548 {
549 skb = skb2;
550 skb2 = skb->link3;
551 skb->link3 = NULL;
552 if (after(skb->end_seq, window_seq))
553 {
554 if (sk->packets_out > 0)
555 sk->packets_out--;
556 /* We may need to remove this from the dev send list. */
557 if (skb->next != NULL)
558 {
559 skb_unlink(skb);
560 }
561 /* Now add it to the write_queue. */
562 if (wskb == NULL)
563 skb_queue_head(&sk->write_queue,skb);
564 else
565 skb_append(wskb,skb);
566 wskb = skb;
567 }
568 else
569 {
570 if (sk->send_head == NULL)
571 {
572 sk->send_head = skb;
573 sk->send_tail = skb;
574 }
575 else
576 {
577 sk->send_tail->link3 = skb;
578 sk->send_tail = skb;
579 }
580 skb->link3 = NULL;
581 }
582 }
583 sti();
584 }
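/*
 *	In other words: if the peer pulls the right window edge back from,
 *	say, 5000 to 4000, every queued frame whose end_seq is beyond 4000
 *	is taken off the retransmit list and put back on the write_queue,
 *	to be sent again only once the window re-opens.
 */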
585
586
587 /*
588 * This routine deals with incoming acks, but not outgoing ones.
589 *
590 * This routine is totally _WRONG_. The list structuring is wrong,
591 * the algorithm is wrong, the code is wrong.
592 */
593
594 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
595 {
596 int flag = 0;
597 u32 window_seq;
598
599 /*
600 * 1 - there was data in packet as well as ack or new data is sent or
601 * in shutdown state
602 * 2 - data from retransmit queue was acked and removed
603 * 4 - window shrunk or data from retransmit queue was acked and removed
604 */
605
606 if(sk->zapped)
607 return(1); /* Dead, can't ack any more so why bother */
608
609 /*
610 * We have dropped back to keepalive timeouts. Thus we have
611 * no retransmits pending.
612 */
613
614 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
615 sk->retransmits = 0;
616
617 /*
618 * If the ack is newer than sent or older than previous acks
619 * then we can probably ignore it.
620 */
621
622 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
623 goto uninteresting_ack;
624
625 /*
626 * If there is data set flag 1
627 */
628
629 if (len != th->doff*4)
630 flag |= 1;
631
632 /*
633 * Have we discovered a larger window
634 */
635 window_seq = ntohs(th->window);
636 if (window_seq > sk->max_window)
637 {
638 sk->max_window = window_seq;
639 #ifdef CONFIG_INET_PCTCP
640 /* Hack because we don't send partial packets to non SWS
641 handling hosts */
642 sk->mss = min(window_seq>>1, sk->mtu);
643 #else
644 sk->mss = min(window_seq, sk->mtu);
645 #endif
646 }
647 window_seq += ack;
648
649 /*
650 * See if our window has been shrunk.
651 */
652 if (after(sk->window_seq, window_seq)) {
653 flag |= 4;
654 tcp_window_shrunk(sk, window_seq);
655 }
656
657 /*
658 * Update the right hand window edge of the host
659 */
660 sk->window_seq = window_seq;
661
662 /*
663 * Pipe has emptied
664 */
665 if (sk->send_tail == NULL || sk->send_head == NULL)
666 {
667 sk->send_head = NULL;
668 sk->send_tail = NULL;
669 sk->packets_out= 0;
670 }
671
672 /*
673 * We don't want too many packets out there.
674 */
675
676 if (sk->ip_xmit_timeout == TIME_WRITE &&
677 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
678 {
679
680 /*
681 * This is Jacobson's slow start and congestion avoidance.
682 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
683 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
684 * counter and increment it once every cwnd times. It's possible
685 * that this should be done only if sk->retransmits == 0. I'm
686 * interpreting "new data is acked" as including data that has
687 * been retransmitted but is just now being acked.
688 */
689 if (sk->cong_window < sk->ssthresh)
690 /*
691 * In "safe" area, increase
692 */
693 sk->cong_window++;
694 else
695 {
696 /*
697 * In dangerous area, increase slowly. In theory this is
698 * sk->cong_window += 1 / sk->cong_window
699 */
700 if (sk->cong_count >= sk->cong_window)
701 {
702 sk->cong_window++;
703 sk->cong_count = 0;
704 }
705 else
706 sk->cong_count++;
707 }
708 }
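	/*
	 *	E.g. starting from cong_window=1, below ssthresh the window
	 *	grows by one mss per qualifying ack (1, 2, 4, ... per round
	 *	trip), while above ssthresh cong_count must reach cong_window
	 *	before the window is bumped, giving roughly one extra mss per
	 *	round trip.
	 */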
709
710 /*
711 * Remember the highest ack received.
712 */
713
714 sk->rcv_ack_seq = ack;
715
716 /*
717 * We passed data and got it acked, remove any soft error
718 * log. Something worked...
719 */
720
721 sk->err_soft = 0;
722
723 /*
724 * If this ack opens up a zero window, clear backoff. It was
725 * being used to time the probes, and is probably far higher than
726 * it needs to be for normal retransmission.
727 */
728
729 if (sk->ip_xmit_timeout == TIME_PROBE0)
730 {
731 sk->retransmits = 0; /* Our probe was answered */
732
733 /*
734 * Was it a usable window open ?
735 */
736
737 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
738 ! before (sk->window_seq, sk->write_queue.next->end_seq))
739 {
740 sk->backoff = 0;
741
742 /*
743 * Recompute rto from rtt. this eliminates any backoff.
744 */
745
746 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
747 if (sk->rto > 120*HZ)
748 sk->rto = 120*HZ;
749 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
750 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
751 .2 of a second is going to need huge windows (SIGH) */
752 sk->rto = HZ/5;
753 }
754 }
755
756 /*
757 * See if we can take anything off of the retransmit queue.
758 */
759
760 while(sk->send_head != NULL)
761 {
762 /* Check for a bug. */
763 if (sk->send_head->link3 &&
764 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
765 printk("INET: tcp.c: *** bug send_list out of order.\n");
766
767 /*
768 * If our packet is before the ack sequence we can
769 * discard it as it's confirmed to have arrived at the other end.
770 */
771
772 if (before(sk->send_head->end_seq, ack+1))
773 {
774 struct sk_buff *oskb;
775 if (sk->retransmits)
776 {
777 /*
778 * We were retransmitting; don't count this in the RTT estimate
779 */
780 flag |= 2;
781
782 /*
783 * even though we've gotten an ack, we're still
784 * retransmitting as long as we're sending from
785 * the retransmit queue. Keeping retransmits non-zero
786 * prevents us from getting new data interspersed with
787 * retransmissions.
788 */
789
790 if (sk->send_head->link3) /* Any more queued retransmits? */
791 sk->retransmits = 1;
792 else
793 sk->retransmits = 0;
794 }
795 /*
796 * Note that we only reset backoff and rto in the
797 * rtt recomputation code. And that doesn't happen
798 * if there were retransmissions in effect. So the
799 * first new packet after the retransmissions is
800 * sent with the backoff still in effect. Not until
801 * we get an ack from a non-retransmitted packet do
802 * we reset the backoff and rto. This allows us to deal
803 * with a situation where the network delay has increased
804 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
805 */
806
807 /*
808 * We have one less packet out there.
809 */
810
811 if (sk->packets_out > 0)
812 sk->packets_out --;
813
814 oskb = sk->send_head;
815
816 if (!(flag&2)) /* Not retransmitting */
817 tcp_rtt_estimator(sk,oskb);
818 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
819 in this case', as we just set it up */
820 cli();
821 oskb = sk->send_head;
822 IS_SKB(oskb);
823 sk->send_head = oskb->link3;
824 if (sk->send_head == NULL)
825 {
826 sk->send_tail = NULL;
827 }
828
829 /*
830 * We may need to remove this from the dev send list.
831 */
832
833 if (oskb->next)
834 skb_unlink(oskb);
835 sti();
836 kfree_skb(oskb, FREE_WRITE); /* write. */
837 if (!sk->dead)
838 sk->write_space(sk);
839 }
840 else
841 {
842 break;
843 }
844 }
845
846 /*
847 * XXX someone ought to look at this too.. at the moment, if skb_peek()
848 * returns non-NULL, we completely ignore the timer stuff in the else
849 * clause. We ought to organize the code so that else clause can
850 * (should) be executed regardless, possibly moving the PROBE timer
851 * reset over. The skb_peek() thing should only move stuff to the
852 * write queue, NOT also manage the timer functions.
853 */
854
855 /*
856 * Maybe we can take some stuff off of the write queue,
857 * and put it onto the xmit queue.
858 */
859 if (skb_peek(&sk->write_queue) != NULL)
860 {
861 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
862 (sk->retransmits == 0 ||
863 sk->ip_xmit_timeout != TIME_WRITE ||
864 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
865 && sk->packets_out < sk->cong_window)
866 {
867 /*
868 * Add more data to the send queue.
869 */
870 flag |= 1;
871 tcp_write_xmit(sk);
872 }
873 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
874 sk->send_head == NULL &&
875 sk->ack_backlog == 0 &&
876 sk->state != TCP_TIME_WAIT)
877 {
878 /*
879 * Data to queue but no room.
880 */
881 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
882 }
883 }
884 else
885 {
886 /*
887 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
888 * from TCP_CLOSE we don't do anything
889 *
890 * from anything else, if there is write data (or fin) pending,
891 * we use a TIME_WRITE timeout, else if keepalive we reset to
892 * a KEEPALIVE timeout, else we delete the timer.
893 *
894 * We do not set flag for nominal write data, otherwise we may
895 * force a state where we start to write itsy bitsy tidbits
896 * of data.
897 */
898
899 switch(sk->state) {
900 case TCP_TIME_WAIT:
901 /*
902 * keep us in TIME_WAIT until we stop getting packets,
903 * reset the timeout.
904 */
905 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
906 break;
907 case TCP_CLOSE:
908 /*
909 * don't touch the timer.
910 */
911 break;
912 default:
913 /*
914 * Must check send_head, write_queue, and ack_backlog
915 * to determine which timeout to use.
916 */
917 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
918 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
919 } else if (sk->keepopen) {
920 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
921 } else {
922 del_timer(&sk->retransmit_timer);
923 sk->ip_xmit_timeout = 0;
924 }
925 break;
926 }
927 }
928
929 /*
930 * We have nothing queued but space to send. Send any partial
931 * packets immediately (end of Nagle rule application).
932 */
933
934 if (sk->packets_out == 0 && sk->partial != NULL &&
935 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
936 {
937 flag |= 1;
938 tcp_send_partial(sk);
939 }
940
941 /*
942 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
943 * we are now waiting for an acknowledge to our FIN. The other end is
944 * already in TIME_WAIT.
945 *
946 * Move to TCP_CLOSE on success.
947 */
948
949 if (sk->state == TCP_LAST_ACK)
950 {
951 if (!sk->dead)
952 sk->state_change(sk);
953 if(sk->debug)
954 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
955 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
956 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
957 {
958 flag |= 1;
959 sk->shutdown = SHUTDOWN_MASK;
960 tcp_set_state(sk,TCP_CLOSE);
961 return 1;
962 }
963 }
964
965 /*
966 * Incoming ACK to a FIN we sent in the case of our initiating the close.
967 *
968 * Move to FIN_WAIT2 to await a FIN from the other end. Set
969 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
970 */
971
972 if (sk->state == TCP_FIN_WAIT1)
973 {
974
975 if (!sk->dead)
976 sk->state_change(sk);
977 if (sk->rcv_ack_seq == sk->write_seq)
978 {
979 flag |= 1;
980 sk->shutdown |= SEND_SHUTDOWN;
981 tcp_set_state(sk, TCP_FIN_WAIT2);
982 }
983 }
984
985 /*
986 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
987 *
988 * Move to TIME_WAIT
989 */
990
991 if (sk->state == TCP_CLOSING)
992 {
993
994 if (!sk->dead)
995 sk->state_change(sk);
996 if (sk->rcv_ack_seq == sk->write_seq)
997 {
998 flag |= 1;
999 tcp_time_wait(sk);
1000 }
1001 }
1002
1003 /*
1004 * Final ack of a three way shake
1005 */
1006
1007 if(sk->state==TCP_SYN_RECV)
1008 {
1009 tcp_set_state(sk, TCP_ESTABLISHED);
1010 tcp_options(sk,th);
1011 sk->dummy_th.dest=th->source;
1012 sk->copied_seq = sk->acked_seq;
1013 if(!sk->dead)
1014 sk->state_change(sk);
1015 if(sk->max_window==0)
1016 {
1017 sk->max_window=32; /* Sanity check */
1018 sk->mss=min(sk->max_window,sk->mtu);
1019 }
1020 }
1021
1022 /*
1023 * I make no guarantees about the first clause in the following
1024 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
1025 * what conditions "!flag" would be true. However I think the rest
1026 * of the conditions would prevent that from causing any
1027 * unnecessary retransmission.
1028 * Clearly if the first packet has expired it should be
1029 * retransmitted. The other alternative, "flag&2 && retransmits", is
1030 * harder to explain: You have to look carefully at how and when the
1031 * timer is set and with what timeout. The most recent transmission always
1032 * sets the timer. So in general if the most recent thing has timed
1033 * out, everything before it has as well. So we want to go ahead and
1034 * retransmit some more. If we didn't explicitly test for this
1035 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1036 * would not be true. If you look at the pattern of timing, you can
1037 * show that rto is increased fast enough that the next packet would
1038 * almost never be retransmitted immediately. Then you'd end up
1039 * waiting for a timeout to send each packet on the retransmission
1040 * queue. With my implementation of the Karn sampling algorithm,
1041 * the timeout would double each time. The net result is that it would
1042 * take a hideous amount of time to recover from a single dropped packet.
1043 * It's possible that there should also be a test for TIME_WRITE, but
1044 * I think as long as "send_head != NULL" and "retransmit" is on, we've
1045 * got to be in real retransmission mode.
1046 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
1047 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1048 * As long as no further losses occur, this seems reasonable.
1049 */
1050
1051 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1052 (((flag&2) && sk->retransmits) ||
1053 (sk->send_head->when + sk->rto < jiffies)))
1054 {
1055 if(sk->send_head->when + sk->rto < jiffies)
1056 tcp_retransmit(sk,0);
1057 else
1058 {
1059 tcp_do_retransmit(sk, 1);
1060 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1061 }
1062 }
1063
1064 return 1;
1065
1066 uninteresting_ack:
1067 if(sk->debug)
1068 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1069
1070 /*
1071 * Keepalive processing.
1072 */
1073
1074 if (after(ack, sk->sent_seq))
1075 {
1076 return 0;
1077 }
1078
1079 /*
1080 * Restart the keepalive timer.
1081 */
1082
1083 if (sk->keepopen)
1084 {
1085 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1086 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1087 }
1088 return 1;
1089 }
1090
1091
1092 /*
1093 * Process the FIN bit. This now behaves as it is supposed to work
1094 * and the FIN takes effect when it is validly part of sequence
1095 * space, not before (i.e. not while there are still holes).
1096 *
1097 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1098 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1099 * TIME-WAIT)
1100 *
1101 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1102 * close and we go into CLOSING (and later onto TIME-WAIT)
1103 *
1104 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1105 *
1106 */
1107
1108 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1109 {
1110 sk->fin_seq = skb->end_seq;
1111
1112 if (!sk->dead)
1113 {
1114 sk->state_change(sk);
1115 sock_wake_async(sk->socket, 1);
1116 }
1117
1118 switch(sk->state)
1119 {
1120 case TCP_SYN_RECV:
1121 case TCP_SYN_SENT:
1122 case TCP_ESTABLISHED:
1123 /*
1124 * move to CLOSE_WAIT, tcp_data() already handled
1125 * sending the ack.
1126 */
1127 tcp_set_state(sk,TCP_CLOSE_WAIT);
1128 if (th->rst)
1129 sk->shutdown = SHUTDOWN_MASK;
1130 break;
1131
1132 case TCP_CLOSE_WAIT:
1133 case TCP_CLOSING:
1134 /*
1135 * received a retransmission of the FIN, do
1136 * nothing.
1137 */
1138 break;
1139 case TCP_TIME_WAIT:
1140 /*
1141 * received a retransmission of the FIN,
1142 * restart the TIME_WAIT timer.
1143 */
1144 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1145 return(0);
1146 case TCP_FIN_WAIT1:
1147 /*
1148 * This case occurs when a simultaneous close
1149 * happens, we must ack the received FIN and
1150 * enter the CLOSING state.
1151 *
1152 * This causes a WRITE timeout, which will either
1153 * move on to TIME_WAIT when we timeout, or resend
1154 * the FIN properly (maybe we get rid of that annoying
1155 * FIN lost hang). The TIME_WRITE code is already correct
1156 * for handling this timeout.
1157 */
1158
1159 if(sk->ip_xmit_timeout != TIME_WRITE)
1160 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1161 tcp_set_state(sk,TCP_CLOSING);
1162 break;
1163 case TCP_FIN_WAIT2:
1164 /*
1165 * received a FIN -- send ACK and enter TIME_WAIT
1166 */
1167 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1168 sk->shutdown|=SHUTDOWN_MASK;
1169 tcp_set_state(sk,TCP_TIME_WAIT);
1170 break;
1171 case TCP_CLOSE:
1172 /*
1173 * already in CLOSE
1174 */
1175 break;
1176 default:
1177 tcp_set_state(sk,TCP_LAST_ACK);
1178
1179 /* Start the timers. */
1180 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1181 return(0);
1182 }
1183
1184 return(0);
1185 }
1186
1187 /*
1188 * Called for each packet when we find a new ACK endpoint sequence in it
1189 */
1190 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
1191 {
1192 /*
1193 * When we ack the fin, we do the FIN
1194 * processing.
1195 */
1196 skb->acked = 1;
1197 if (skb->h.th->fin)
1198 tcp_fin(skb,sk,skb->h.th);
1199 return skb->end_seq;
1200 }
1201
1202
1203 /*
1204 * Add a sk_buff to the TCP receive queue, calculating
1205 * the ACK sequence as we go..
1206 */
1207 static void tcp_queue(struct sk_buff * skb, struct sock * sk,
1208 struct tcphdr *th, unsigned long saddr)
1209 {
1210 struct sk_buff_head * list = &sk->receive_queue;
1211 struct sk_buff * next;
1212 u32 ack_seq;
1213
1214 /*
1215 * Find where the new skb goes.. (This goes backwards,
1216 * on the assumption that we get the packets in order)
1217 */
1218 next = list->prev;
1219 while (next != (struct sk_buff *) list) {
1220 if (!after(next->seq, skb->seq))
1221 break;
1222 next = next->prev;
1223 }
1224 /*
1225 * put it after the packet we found (which
1226 * may be the list-head, but that's fine).
1227 */
1228 __skb_append(next, skb, list);
1229 next = skb->next;
1230
1231 /*
1232 * Did we get anything new to ack?
1233 */
1234 ack_seq = sk->acked_seq;
1235 if (!after(skb->seq, ack_seq) && after(skb->end_seq, ack_seq)) {
1236 ack_seq = tcp_queue_ack(skb, sk);
1237
1238 /*
1239 * Do we have any old packets to ack that the above
1240 * made visible? (Go forward from skb)
1241 */
1242 while (next != (struct sk_buff *) list) {
1243 if (after(next->seq, ack_seq))
1244 break;
1245 if (after(next->end_seq, ack_seq))
1246 ack_seq = tcp_queue_ack(next, sk);
1247 next = next->next;
1248 }
1249
1250 /*
1251 * Ok, we found new data, update acked_seq as
1252 * necessary (and possibly send the actual
1253 * ACK packet).
1254 */
1255 sk->acked_seq = ack_seq;
1256
1257 /*
1258 * rules for delaying an ack:
1259 * - delay time <= 0.5 HZ
1260 * - must send at least every 2 full sized packets
1261 * - we don't have a window update to send
1262 *
1263 * We handle the window update in the actual read
1264 * side, so we only have to worry about the first two.
1265 */
1266 if (!sk->delay_acks || th->fin) {
1267 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1268 }
1269 else
1270 {
1271 int timeout = sk->ato;
1272 if (timeout > HZ/2)
1273 timeout = HZ/2;
1274 if (sk->bytes_rcv > sk->max_unacked) {
1275 timeout = 0;
1276 mark_bh(TIMER_BH);
1277 }
1278 sk->ack_backlog++;
1279 if(sk->debug)
1280 printk("Ack queued.\n");
1281 tcp_reset_xmit_timer(sk, TIME_WRITE, timeout);
1282 }
1283 }
1284 }
1285
1286
1287 /*
1288 * This routine handles the data. If there is room in the buffer,
1289 * it will already have been moved into it. If there is no
1290 * room, then we will just have to discard the packet.
1291 */
1292
1293 static int tcp_data(struct sk_buff *skb, struct sock *sk,
1294 unsigned long saddr, unsigned short len)
1295 {
1296 struct tcphdr *th;
1297 u32 new_seq, shut_seq;
1298
1299 th = skb->h.th;
1300 skb_pull(skb,th->doff*4);
1301 skb_trim(skb,len-(th->doff*4));
1302
1303 /*
1304 * The bytes in the receive read/assembly queue have increased. Needed for the
1305 * low memory discard algorithm
1306 */
1307
1308 sk->bytes_rcv += skb->len;
1309
1310 if (skb->len == 0 && !th->fin)
1311 {
1312 /*
1313 * Don't want to keep passing acks back and forth.
1314 * (someone sent us a dataless, boring frame)
1315 */
1316 if (!th->ack)
1317 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
1318 kfree_skb(skb, FREE_READ);
1319 return(0);
1320 }
1321
1322 /*
1323 * We no longer have anyone receiving data on this connection.
1324 */
1325
1326 #ifndef TCP_DONT_RST_SHUTDOWN
1327
1328 if(sk->shutdown & RCV_SHUTDOWN)
1329 {
1330 /*
1331 * FIXME: BSD has some magic to avoid sending resets to
1332 * broken 4.2 BSD keepalives. Much to my surprise a few non
1333 * BSD stacks still have broken keepalives so we want to
1334 * cope with it.
1335 */
1336
1337 if(skb->len) /* We don't care if it's just an ack or
1338 a keepalive/window probe */
1339 {
1340 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
1341
1342 /* Do this the way 4.4BSD treats it. Not what I'd
1343 regard as the meaning of the spec but it's what BSD
1344 does and clearly they know everything 8) */
1345
1346 /*
1347 * This is valid because of two things
1348 *
1349 * a) The way tcp_data behaves at the bottom.
1350 * b) A fin takes effect when read not when received.
1351 */
1352
1353 shut_seq = sk->acked_seq+1; /* Last byte */
1354
1355 if(after(new_seq,shut_seq))
1356 {
1357 if(sk->debug)
1358 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1359 sk, new_seq, shut_seq, sk->blog);
1360 if(sk->dead)
1361 {
1362 sk->acked_seq = new_seq + th->fin;
1363 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1364 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1365 tcp_statistics.TcpEstabResets++;
1366 sk->err = EPIPE;
1367 sk->error_report(sk);
1368 sk->shutdown = SHUTDOWN_MASK;
1369 tcp_set_state(sk,TCP_CLOSE);
1370 kfree_skb(skb, FREE_READ);
1371 return 0;
1372 }
1373 }
1374 }
1375 }
1376
1377 #endif
1378
1379 tcp_queue(skb, sk, th, saddr);
1380
1381 /*
1382 * If we've missed a packet, send an ack.
1383 * Also start a timer to send another.
1384 */
1385
1386 if (!skb->acked)
1387 {
1388
1389 /*
1390 * This is important. If we don't have much room left,
1391 * we need to throw out a few packets so we have a good
1392 * window. Note that mtu is used, not mss, because mss is really
1393 * for the send side. He could be sending us stuff as large as mtu.
1394 */
1395
1396 while (sock_rspace(sk) < sk->mtu)
1397 {
1398 struct sk_buff * skb1 = skb_peek(&sk->receive_queue);
1399 if (skb1 == NULL)
1400 {
1401 printk("INET: tcp.c:tcp_data memory leak detected.\n");
1402 break;
1403 }
1404
1405 /*
1406 * Don't throw out something that has been acked.
1407 */
1408
1409 if (skb1->acked)
1410 {
1411 break;
1412 }
1413
1414 skb_unlink(skb1);
1415 kfree_skb(skb1, FREE_READ);
1416 }
1417 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
1418 sk->ack_backlog++;
1419 tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
1420 }
1421
1422 /*
1423 * Now tell the user we may have some data.
1424 */
1425
1426 if (!sk->dead)
1427 {
1428 if(sk->debug)
1429 printk("Data wakeup.\n");
1430 sk->data_ready(sk,0);
1431 }
1432 return(0);
1433 }
1434
1435
1436 /*
1437 * This routine is only called when we have urgent data
1438 * signalled. It's the 'slow' part of tcp_urg. It could be
1439 * moved inline now as tcp_urg is only called from one
1440 * place. We handle URGent data wrong. We have to - as
1441 * BSD still doesn't use the correction from RFC961.
1442 */
1443
1444 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1445 {
1446 u32 ptr = ntohs(th->urg_ptr);
1447
1448 if (ptr)
1449 ptr--;
1450 ptr += ntohl(th->seq);
1451
1452 /* ignore urgent data that we've already seen and read */
1453 if (after(sk->copied_seq, ptr))
1454 return;
1455
1456 /* do we already have a newer (or duplicate) urgent pointer? */
1457 if (sk->urg_data && !after(ptr, sk->urg_seq))
1458 return;
1459
1460 /* tell the world about our new urgent pointer */
1461 if (sk->proc != 0) {
1462 if (sk->proc > 0) {
1463 kill_proc(sk->proc, SIGURG, 1);
1464 } else {
1465 kill_pg(-sk->proc, SIGURG, 1);
1466 }
1467 }
1468 sk->urg_data = URG_NOTYET;
1469 sk->urg_seq = ptr;
1470 }
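/*
 *	Example of the above: for a segment with seq=1000 and urg_ptr=5,
 *	urg_seq ends up as 1004; the incoming pointer is taken to point
 *	just past the last urgent byte (the BSD interpretation referred to
 *	in the comment before this function) rather than at it.
 */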
1471
1472 /*
1473 * This is the 'fast' part of urgent handling.
1474 */
1475
1476 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1477 {
1478 /*
1479 * Check if we get a new urgent pointer - normally not
1480 */
1481
1482 if (th->urg)
1483 tcp_check_urg(sk,th);
1484
1485 /*
1486 * Do we wait for any urgent data? - normally not
1487 */
1488
1489 if (sk->urg_data == URG_NOTYET) {
1490 u32 ptr;
1491
1492 /*
1493 * Is the urgent pointer pointing into this packet?
1494 */
1495 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1496 if (ptr < len) {
1497 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1498 if (!sk->dead)
1499 sk->data_ready(sk,0);
1500 }
1501 }
1502 }
1503
1504
1505 /*
1506 * A TCP packet has arrived.
1507 * skb->h.raw is the TCP header.
1508 */
1509
1510 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
1511 __u32 daddr, unsigned short len,
1512 __u32 saddr, int redo, struct inet_protocol * protocol)
1513 {
1514 struct tcphdr *th;
1515 struct sock *sk;
1516 int syn_ok=0;
1517
1518 /*
1519 * "redo" is 1 if we have already seen this skb but couldn't
1520 * use it at that time (the socket was locked). In that case
1521 * we have already done a lot of the work (looked up the socket
1522 * etc).
1523 */
1524 th = skb->h.th;
1525 sk = skb->sk;
1526 if (!redo) {
1527 tcp_statistics.TcpInSegs++;
1528 if (skb->pkt_type!=PACKET_HOST)
1529 goto discard_it;
1530
1531 /*
1532 * Pull up the IP header.
1533 */
1534
1535 skb_pull(skb, skb->h.raw-skb->data);
1536
1537 /*
1538 * Try to use the device checksum if provided.
1539 */
1540 switch (skb->ip_summed)
1541 {
1542 case CHECKSUM_NONE:
1543 skb->csum = csum_partial((char *)th, len, 0);
1544 case CHECKSUM_HW:
1545 if (tcp_check(th, len, saddr, daddr, skb->csum))
1546 goto discard_it;
1547 default:
1548 /* CHECKSUM_UNNECESSARY */
1549 }
1550 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1551 if (!sk)
1552 goto no_tcp_socket;
1553 skb->sk = sk;
1554 skb->seq = ntohl(th->seq);
1555 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1556 skb->ack_seq = ntohl(th->ack_seq);
1557
1558 skb->acked = 0;
1559 skb->used = 0;
1560 skb->free = 1;
1561 skb->saddr = daddr;
1562 skb->daddr = saddr;
1563
1564 /*
1565 * We may need to add it to the backlog here.
1566 */
1567 if (sk->users)
1568 {
1569 __skb_queue_tail(&sk->back_log, skb);
1570 return(0);
1571 }
1572 }
1573
1574 /*
1575 * If this socket has got a reset it's to all intents and purposes
1576 * really dead. Count closed sockets as dead.
1577 *
1578 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1579 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
1580 * exist so should cause resets as if the port was unreachable.
1581 */
1582
1583 if (sk->zapped || sk->state==TCP_CLOSE)
1584 goto no_tcp_socket;
1585
1586 if (!sk->prot)
1587 {
1588 printk("IMPOSSIBLE 3\n");
1589 return(0);
1590 }
1591
1592
1593 /*
1594 * Charge the memory to the socket.
1595 */
1596
1597 skb->sk=sk;
1598 atomic_add(skb->truesize, &sk->rmem_alloc);
1599
1600 /*
1601 * We should now do header prediction.
1602 */
1603
1604 /*
1605 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1606 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1607 * compatibility. We also set up variables more thoroughly [Karn notes in the
1608 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1609 */
1610
1611 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
1612 {
1613
1614 /*
1615 * Now deal with unusual cases.
1616 */
1617
1618 if(sk->state==TCP_LISTEN)
1619 {
1620 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
1621 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1622
1623 /*
1624 * We don't care for RST, and non-SYN segments are absorbed (old segments).
1625 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the
1626 * netmask on a running connection it can go broadcast. Even Suns have
1627 * this problem so I'm ignoring it
1628 */
1629
1630 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1631 {
1632 kfree_skb(skb, FREE_READ);
1633 return 0;
1634 }
1635
1636 /*
1637 * Guess we need to make a new socket up
1638 */
1639
1640 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1641
1642 /*
1643 * Now we have several options: In theory there is nothing else
1644 * in the frame. KA9Q has an option to send data with the syn,
1645 * BSD accepts data with the syn up to the [to be] advertised window
1646 * and Solaris 2.1 gives you a protocol error. For now we just ignore
1647 * it, that fits the spec precisely and avoids incompatibilities. It
1648 * would be nice in future to drop through and process the data.
1649 *
1650 * Now that TTCP is starting to be used we ought to queue this data.
1651 */
1652
1653 return 0;
1654 }
1655
1656 /*
1657 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1658 * then it's a new connection
1659 */
1660
1661 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1662 {
1663 kfree_skb(skb, FREE_READ);
1664 return 0;
1665 }
1666
1667 /*
1668 * SYN sent means we have to look for a suitable ack and either reset
1669 * for bad matches or go to connected. The SYN_SENT case is unusual and should
1670 * not be in line code. [AC]
1671 */
1672
1673 if(sk->state==TCP_SYN_SENT)
1674 {
1675 /* Crossed SYN or previous junk segment */
1676 if(th->ack)
1677 {
1678 /* We got an ack, but it's not a good ack */
1679 if(!tcp_ack(sk,th,skb->ack_seq,len))
1680 {
1681 /* Reset the ack - it's an ack from a
1682 different connection [ th->rst is checked in tcp_send_reset()] */
1683 tcp_statistics.TcpAttemptFails++;
1684 tcp_send_reset(daddr, saddr, th,
1685 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1686 kfree_skb(skb, FREE_READ);
1687 return(0);
1688 }
1689 if(th->rst)
1690 return tcp_reset(sk,skb);
1691 if(!th->syn)
1692 {
1693 /* A valid ack from a different connection
1694 start. Shouldn't happen but cover it */
1695 tcp_statistics.TcpAttemptFails++;
1696 tcp_send_reset(daddr, saddr, th,
1697 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1698 kfree_skb(skb, FREE_READ);
1699 return 0;
1700 }
1701 /*
1702 * Ok.. it's good. Set up sequence numbers and
1703 * move to established.
1704 */
1705 syn_ok=1; /* Don't reset this connection for the syn */
1706 sk->acked_seq = skb->seq+1;
1707 sk->lastwin_seq = skb->seq+1;
1708 sk->fin_seq = skb->seq;
1709 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
1710 tcp_set_state(sk, TCP_ESTABLISHED);
1711 tcp_options(sk,th);
1712 sk->dummy_th.dest=th->source;
1713 sk->copied_seq = sk->acked_seq;
1714 if(!sk->dead)
1715 {
1716 sk->state_change(sk);
1717 sock_wake_async(sk->socket, 0);
1718 }
1719 if(sk->max_window==0)
1720 {
1721 sk->max_window = 32;
1722 sk->mss = min(sk->max_window, sk->mtu);
1723 }
1724 }
1725 else
1726 {
1727 /* See if SYN's cross. Drop if boring */
1728 if(th->syn && !th->rst)
1729 {
1730 /* Crossed SYN's are fine - but talking to
1731 yourself is right out... */
1732 if(sk->saddr==saddr && sk->daddr==daddr &&
1733 sk->dummy_th.source==th->source &&
1734 sk->dummy_th.dest==th->dest)
1735 {
1736 tcp_statistics.TcpAttemptFails++;
1737 return tcp_reset(sk,skb);
1738 }
1739 tcp_set_state(sk,TCP_SYN_RECV);
1740
1741 /*
1742 * FIXME:
1743 * Must send SYN|ACK here
1744 */
1745 }
1746 /* Discard junk segment */
1747 kfree_skb(skb, FREE_READ);
1748 return 0;
1749 }
1750 /*
1751 * SYN_RECV with data maybe.. drop through
1752 */
1753 goto rfc_step6;
1754 }
1755
1756 /*
1757 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1758 * a more complex suggestion for fixing these reuse issues in RFC1644
1759 * but not yet ready for general use. Also see RFC1379.
1760 *
1761 * Note that in this version we don't go back to the top of the
1762 * function for this case; the listening socket is looked up directly
1763 * below, and (as noted there) "sk->users" is not re-checked for the
1764 * new socket, nor are the normal tests re-run on the packet.
1765 */
1766
1767 #define BSD_TIME_WAIT
1768 #ifdef BSD_TIME_WAIT
1769 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1770 after(skb->seq, sk->acked_seq) && !th->rst)
1771 {
1772 u32 seq = sk->write_seq;
1773 if(sk->debug)
1774 printk("Doing a BSD time wait\n");
1775 tcp_statistics.TcpEstabResets++;
1776 atomic_sub(skb->truesize, &sk->rmem_alloc);
1777 skb->sk = NULL;
1778 sk->err=ECONNRESET;
1779 tcp_set_state(sk, TCP_CLOSE);
1780 sk->shutdown = SHUTDOWN_MASK;
1781 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1782 /* this is not really correct: we should check sk->users */
1783 if (sk && sk->state==TCP_LISTEN)
1784 {
1785 skb->sk = sk;
1786 atomic_add(skb->truesize, &sk->rmem_alloc);
1787 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1788 return 0;
1789 }
1790 kfree_skb(skb, FREE_READ);
1791 return 0;
1792 }
1793 #endif
1794 }
1795
1796 /*
1797 * We are now in normal data flow (see the step list in the RFC)
1798 * Note most of these are inline now. I'll inline the lot when
1799 * I have time to test it hard and look at what gcc outputs
1800 */
1801
1802 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1803 {
1804 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
1805 kfree_skb(skb, FREE_READ);
1806 return 0;
1807 }
1808
1809 if(th->rst)
1810 return tcp_reset(sk,skb);
1811
1812 /*
1813 * !syn_ok is effectively the state test in RFC793.
1814 */
1815
1816 if(th->syn && !syn_ok)
1817 {
1818 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1819 return tcp_reset(sk,skb);
1820 }
1821
1822 tcp_delack_estimator(sk);
1823
1824 /*
1825 * Process the ACK
1826 */
1827
1828
1829 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1830 {
1831 /*
1832 * Our three way handshake failed.
1833 */
1834
1835 if(sk->state==TCP_SYN_RECV)
1836 {
1837 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1838 }
1839 kfree_skb(skb, FREE_READ);
1840 return 0;
1841 }
1842
1843 rfc_step6: /* I'll clean this up later */
1844
1845 /*
1846 * If the accepted buffer put us over our queue size we
1847 * now drop it (we must process the ack first to avoid
1848 * deadlock cases).
1849 */
1850 #if 0
1851 /*
1852 * Is this test really a good idea? We should
1853 * throw away packets that aren't in order, not
1854 * new packets.
1855 */
1856 if (sk->rmem_alloc >= sk->rcvbuf)
1857 {
1858 kfree_skb(skb, FREE_READ);
1859 return(0);
1860 }
1861 #endif
1862
1863
1864 /*
1865 * Process urgent data
1866 */
1867
1868 tcp_urg(sk, th, len);
1869
1870 /*
1871 * Process the encapsulated data
1872 */
1873
1874 if(tcp_data(skb,sk, saddr, len))
1875 kfree_skb(skb, FREE_READ);
1876
1877 /*
1878 * And done
1879 */
1880
1881 return 0;
1882
1883 no_tcp_socket:
1884 /*
1885 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1886 */
1887 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1888
1889 discard_it:
1890 /*
1891 * Discard frame
1892 */
1893 skb->sk = NULL;
1894 kfree_skb(skb, FREE_READ);
1895 return 0;
1896 }