/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 */

#include <linux/config.h>
#include <net/tcp.h>

/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */

extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */

	if (sk->lrcvtime == 0)
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else
	{
		int m;

		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		if (m > (sk->rtt >> 3))
		{
			sk->ato = sk->rtt >> 3;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else
		{
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}
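
/*
 * A worked example of the estimator above (added note, not in the
 * original source; figures assume HZ=100, i.e. 10ms ticks). Suppose
 * sk->ato has settled at 10 ticks and sk->rtt (which is scaled,
 * holding 8*srtt) is 80, so sk->rtt>>3 = 10. A segment then arrives
 * m = 4 ticks after the previous one:
 *
 *	m (4) <= rtt>>3 (10), so ato = (10 >> 1) + 4 = 9 ticks
 *
 * Arrivals slower than an eighth of the RTT clamp ato to rtt>>3,
 * while fast regular arrivals pull it towards twice the
 * inter-arrival time (the fixed point of ato = ato/2 + m).
 */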

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */

extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88. Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 */

	m = jiffies - oskb->when;	/* RTT */
	if (m <= 0)
		m = 1;			/* IS THIS RIGHT FOR <0 ??? */
	m -= (sk->rtt >> 3);		/* m is now error in rtt est */
	sk->rtt += m;			/* rtt = 7/8 rtt + 1/8 new */
	if (m < 0)
		m = -m;			/* m is now abs(error) */
	m -= (sk->mdev >> 2);		/* similar update on mdev */
	sk->mdev += m;			/* mdev = 3/4 mdev + 1/4 new */

	/*
	 *	Now update timeout. Note that this removes any backoff.
	 */

	sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}
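
/*
 * The fixed point arithmetic above, spelled out (added note, not in
 * the original source). sk->rtt holds 8*srtt and sk->mdev holds
 * 4*mdev, so with R = srtt and D = mdev the updates amount to
 *
 *	err = m - R
 *	R  += err/8		(sk->rtt += m, in 8x units)
 *	D  += (|err| - D)/4	(sk->mdev += m, in 4x units)
 *
 * and the timeout works out as
 *
 *	rto = ((8R >> 2) + 4D) >> 1 = R + 2D
 *
 * e.g. srtt = 30 ticks and mdev = 5 ticks give sk->rtt = 240,
 * sk->mdev = 20 and rto = (60 + 20) >> 1 = 40 ticks, i.e. 400ms
 * at HZ=100.
 */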

/*
 *	Cached last hit socket
 */

static volatile unsigned long	th_cache_saddr, th_cache_daddr;
static volatile unsigned short	th_cache_dport, th_cache_sport;
static volatile struct sock	*th_cache_sk;

void tcp_cache_zap(void)
{
	th_cache_sk = NULL;
}

/*
 *	Find the socket, using the last hit cache if applicable. The cache is not quite
 *	right...
 */

static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
		if (sk) {
			th_cache_saddr = saddr;
			th_cache_daddr = daddr;
			th_cache_dport = dport;
			th_cache_sport = sport;
			th_cache_sk = sk;
		}
	}
	return sk;
}
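
/*
 * Added note: the cache above holds a bare pointer with no reference
 * count, which is why tcp_cache_zap() has to be called whenever a
 * socket is torn down or rebound - otherwise a later segment with
 * the same address/port tuple could be matched to a stale sock.
 */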

/*
 *	React to an out-of-window TCP sequence number in an incoming packet
 */

static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	struct options *opt, unsigned long saddr, struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
	{
		tcp_send_reset(sk->saddr, sk->daddr, th, sk->prot, NULL, dev, sk->ip_tos, sk->ip_ttl);
		return;
	}

	/*
	 *	4.3reno machines look for these kinds of acks so they can do fast
	 *	recovery. Three identical 'old' acks let it know that one frame has
	 *	been lost and should be resent. Because this is before the whole window
	 *	of data has timed out it can take one lost frame per window without
	 *	stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	 *
	 *	We also should be spotting triple bad sequences.
	 */
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return;
}

/*
 *	This function checks to see if the tcp header is actually acceptable.
 */

extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->acked_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}
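
/*
 * A quick example of the acceptance test (added, not in the original
 * source). With acked_seq = 1000 and window = 500, end_window = 1500:
 *
 *	seq=1200, end_seq=1300	in window, accepted
 *	seq=900,  end_seq=950	entirely before acked_seq, rejected
 *	seq=900,  end_seq=1100	overlaps the window, accepted
 *	seq=1500, end_seq=1500	zero length segment exactly at the
 *				right edge, accepted by the first clause
 */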

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if (sk->state != TCP_TIME_WAIT)
	{
		tcp_set_state(sk, TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}


/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN. So you
 *	can't assume this is always the SYN. It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */

static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int mss_seen = 0;

	ptr = (unsigned char *)(th + 1);

	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize = *ptr++;
		switch (opcode)
		{
			case TCPOPT_EOL:
				return;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				ptr--;		/* the opsize=*ptr++ above was a mistake */
				continue;

			default:
				if (opsize <= 2)	/* Avoid silly options looping forever */
					return;
				switch (opcode)
				{
					case TCPOPT_MSS:
						if (opsize == 4 && th->syn)
						{
							sk->mtu = min(sk->mtu, ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
						}
						break;
					/* Add other options here as people feel the urge to implement stuff like large windows */
				}
				ptr += opsize - 2;
				length -= opsize;
		}
	}
	if (th->syn)
	{
		if (!mss_seen)
			sk->mtu = min(sk->mtu, 536);	/* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif
}
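
/*
 * For reference (added note): the one option the parser above acts
 * on is laid out like this on the wire (RFC 793) -
 *
 *	+--------+--------+--------+--------+
 *	| kind=2 | len=4  |   max seg size  |
 *	+--------+--------+--------+--------+
 *
 * so opsize is 4 and ptr is left pointing at the 16 bit MSS, which
 * is why the code above reads ntohs(*(unsigned short *)ptr) and only
 * believes it on a segment carrying a SYN.
 */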


/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */

static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
	u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead)
	{
		sk->data_ready(sk, 0);
	}
	else
	{
		if (sk->debug)
			printk("Reset on %p: Connect on dead socket.\n", sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more. This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	We need to build a new sock struct.
	 *	It is sort of bad to have a socket without an inode attached
	 *	to it, but the wake_up's will just wake up the listening socket,
	 *	and if the listening socket is destroyed before this is taken
	 *	off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL)
	{
		/* just ignore the syn. It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));
	newsk->opt = NULL;
	newsk->ip_route_cache = NULL;
	if (opt && opt->optlen)
	{
		/* Echo the IP options into the new socket, not the listener */
		newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!newsk->opt)
		{
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb))
		{
			kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
			kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq;
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger = 0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = &tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/*
	 *	Swap these two, they are from our point of view.
	 */

	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;

	put_sock(newsk->num, newsk);
	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;

	/*
	 *	Grab the ttl and tos values and use them
	 */

	newsk->ip_ttl = sk->ip_ttl;
	newsk->ip_tos = skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for
	 */

	/*
	 *	Note use of sk->user_mss, since user has no direct access to newsk
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
	newsk->ip_route_cache = rt;

	if (rt != NULL && (rt->rt_flags & RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final packet we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if (skip_pick_mtu != NULL)	/* If SKIP is loaded.. */
		newsk->mtu = skip_pick_mtu(newsk->mtu, dev);
#endif
	/*
	 *	This will min with what arrived in the packet
	 */

	tcp_options(newsk, skb->h.th);

	tcp_cache_zap();
	tcp_send_synack(newsk, sk, skb);
}


/*
 *	Handle a TCP window that shrunk on us. It shouldn't happen,
 *	but..
 *
 *	We may need to move packets from the send queue
 *	to the write queue, if the window has been shrunk on us.
 *	The RFC says you are not allowed to shrink your window
 *	like this, but if the other end does, you must be able
 *	to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;

	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL)
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq))
		{
			if (sk->packets_out > 0)
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL)
			{
				skb_unlink(skb);
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue, skb);
			else
				skb_append(wskb, skb);
			wskb = skb;
		}
		else
		{
			if (sk->send_head == NULL)
			{
				sk->send_head = skb;
				sk->send_tail = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}
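
/*
 * Illustration (added, not in the original source): say segments
 * with end_seq 100, 200 and 300 sit on the retransmit queue and the
 * peer shrinks the window so that window_seq is now 200. The loop
 * above rebuilds send_head/send_tail with the first two and moves
 * the 300 segment back onto the front of write_queue, unlinking it
 * from any device queue first, so it is only sent again once the
 * window reopens.
 */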


/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if (sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
		goto uninteresting_ack;

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	Have we discovered a larger window?
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window)
	{
		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk.
	 */
	if (after(sk->window_seq, window_seq)) {
		flag |= 4;
		tcp_window_shrunk(sk, window_seq);
	}

	/*
	 *	Update the right hand window edge of the host
	 */
	sk->window_seq = window_seq;

	/*
	 *	Pipe has emptied
	 */
	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out = 0;
	}

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{

		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}
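
	/*
	 * The growth this produces, spelled out (added note, figures
	 * hypothetical). With ssthresh = 4, every new ack below
	 * ssthresh does cong_window++, so a full window of acks
	 * roughly doubles cong_window each round trip (slow start).
	 * At and above ssthresh it takes cong_window acks - a whole
	 * window's worth - to drive cong_count up far enough to earn
	 * one increment, i.e. linear growth of about one segment per
	 * round trip (congestion avoidance).
	 */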

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */

	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff. It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&	/* should always be non-null */
		    !before(sk->window_seq, sk->write_queue.next->end_seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt. This eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
				sk->rto = HZ/5;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while (sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived at the other end.
		 */

		if (before(sk->send_head->end_seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting. Don't count this in RTT est
				 */
				flag |= 2;

				/*
				 *	even though we've gotten an ack, we're still
				 *	retransmitting as long as we're sending from
				 *	the retransmit queue. Keeping retransmits non-zero
				 *	prevents us from getting new data interspersed with
				 *	retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out--;

			oskb = sk->send_head;

			if (!(flag&2))	/* Not retransmitting */
				tcp_rtt_estimator(sk, oskb);
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt'
					   in this case, as we just set it up */
			cli();
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

			/*
			 *	We may need to remove this from the dev send list.
			 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we completely ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 *	Maybe we can take some stuff off of the write queue,
	 *	and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after(sk->window_seq+1, sk->write_queue.next->end_seq) &&
			(sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
			sk->send_head == NULL &&
			sk->ack_backlog == 0 &&
			sk->state != TCP_TIME_WAIT)
		{
			/*
			 *	Data to queue but no room.
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch (sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 *	Must check send_head, write_queue, and ack_backlog
			 *	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq, sk->write_seq, sk->acked_seq, sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk, TCP_CLOSE);
			return 1;
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 * Move to FIN_WAIT2 to await a FIN from the other end. Set
	 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 * Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if (sk->state == TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk, th);
		sk->dummy_th.dest = th->source;
		sk->copied_seq = sk->acked_seq;
		if (!sk->dead)
			sk->state_change(sk);
		if (sk->max_window == 0)
		{
			sk->max_window = 32;	/* Sanity check */
			sk->mss = min(sk->max_window, sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if (sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk, 0);
		else
		{
			tcp_do_retransmit(sk, 1);
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return 1;

uninteresting_ack:
	if (sk->debug)
		printk("Ack ignored %u %u\n", ack, sk->sent_seq);

	/*
	 *	Keepalive processing.
	 */

	if (after(ack, sk->sent_seq))
	{
		return 0;
	}

	/*
	 *	Restart the keepalive timer.
	 */

	if (sk->keepopen)
	{
		if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}


/*
 *	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch (sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk, TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE)
				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown |= SHUTDOWN_MASK;
			tcp_set_state(sk, TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk, TCP_LAST_ACK);

			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}


/*
 *	This routine handles the data. If there is room in the buffer,
 *	it will already have been moved into it. If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk,
	unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped = 0;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb, th->doff*4);
	skb_trim(skb, len - (th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue have increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us a dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if (skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */

			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */

			shut_seq = sk->acked_seq + 1;	/* Last byte */

			if (after(new_seq, shut_seq))
			{
				if (sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if (sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk, TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
	 *	Now we have to walk the chain, and figure out where this one
	 *	goes into it.  This is set up so that the last packet we received
	 *	will be the first one we look at, that way if everything comes
	 *	in order, there will be no performance loss, and if they come
	 *	out of order we will be able to fit things in nicely.
	 *
	 *	[AC: This is wrong. We should assume in order first and then walk
	 *	forwards from the first hole based upon real traffic patterns.]
	 */

	if (skb_peek(&sk->receive_queue) == NULL)	/* Empty queue is easy case */
	{
		skb_queue_head(&sk->receive_queue, skb);
		skb1 = NULL;
	}
	else
	{
		for (skb1 = sk->receive_queue.prev; ; skb1 = skb1->prev)
		{
			if (sk->debug)
			{
				printk("skb1=%p :", skb1);
				printk("skb1->seq = %d: ", skb1->seq);
				printk("skb->seq = %d\n", skb->seq);
				printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
					sk->acked_seq);
			}

			/*
			 *	Optimisation: Duplicate frame or extension of previous frame from
			 *	same sequence point (lost ack case).
			 *	The frame contains duplicate data or replaces a previous frame;
			 *	discard the previous frame (safe as sk->users is set) and put
			 *	the new one in its place.
			 */

			if (skb->seq == skb1->seq && skb->len >= skb1->len)
			{
				skb_append(skb1, skb);
				skb_unlink(skb1);
				kfree_skb(skb1, FREE_READ);
				dup_dumped = 1;
				skb1 = NULL;
				break;
			}

			/*
			 *	Found where it fits
			 */

			if (after(skb->seq+1, skb1->seq))
			{
				skb_append(skb1, skb);
				break;
			}

			/*
			 *	See if we've hit the start. If so insert.
			 */
			if (skb1 == skb_peek(&sk->receive_queue))
			{
				skb_queue_head(&sk->receive_queue, skb);
				break;
			}
		}
	}
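
	/*
	 * Example of the insertion walk above (added, not in the
	 * original source). Say the queue holds segments with seq 100
	 * and 300 and a segment with seq 200 arrives. The walk starts
	 * at the tail (seq 300): after(201, 300) fails, so we step
	 * back to seq 100, where after(201, 100) holds and the new
	 * segment is appended after it, giving 100, 200, 300. An
	 * in-order arrival matches the tail on the first test, so the
	 * common case costs a single comparison.
	 */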

	/*
	 *	Figure out what the ack value for this frame is
	 */

	if (before(sk->acked_seq, sk->copied_seq))
	{
		printk("*** tcp.c:tcp_data bug acked < copied\n");
		sk->acked_seq = sk->copied_seq;
	}

	/*
	 *	Now figure out if we can ack anything. This is very messy because we really want two
	 *	receive queues, a completed and an assembly queue. We also want only one transmit
	 *	queue.
	 */

	if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
	{
		if (before(skb->seq, sk->acked_seq+1))
		{

			if (after(skb->end_seq, sk->acked_seq))
				sk->acked_seq = skb->end_seq;

			skb->acked = 1;

			/*
			 *	When we ack the fin, we do the FIN
			 *	processing.
			 */

			if (skb->h.th->fin)
			{
				tcp_fin(skb, sk, skb->h.th);
			}

			for (skb2 = skb->next;
			     skb2 != (struct sk_buff *)&sk->receive_queue;
			     skb2 = skb2->next)
			{
				if (before(skb2->seq, sk->acked_seq+1))
				{
					if (after(skb2->end_seq, sk->acked_seq))
						sk->acked_seq = skb2->end_seq;

					skb2->acked = 1;
					/*
					 *	When we ack the fin, we do
					 *	the fin handling.
					 */
					if (skb2->h.th->fin)
					{
						tcp_fin(skb2, sk, skb2->h.th);
					}

					/*
					 *	Force an immediate ack.
					 */

					sk->ack_backlog = sk->max_ack_backlog;
				}
				else
				{
					break;
				}
			}

			/*
			 *	This also takes care of updating the window.
			 *	This if statement needs to be simplified.
			 *
			 *	rules for delaying an ack:
			 *	- delay time <= 0.5 HZ
			 *	- we don't have a window update to send
			 *	- must send at least every 2 full sized packets
			 */
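
			/*
			 * Put differently (added note): the test below acks at
			 * once whenever delaying looks unsafe or unhelpful -
			 * delayed acks disabled, more than max_unacked (two
			 * full segments) received unacked, a FIN to
			 * acknowledge, an ack timeout estimate already past
			 * half a second, or a window update worth sending
			 * (tcp_raise_window). Otherwise the ack is counted in
			 * ack_backlog and left to the ato timer.
			 */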
			if (!sk->delay_acks ||
			    /* sk->ack_backlog >= sk->max_ack_backlog || */
			    sk->bytes_rcv > sk->max_unacked || th->fin ||
			    sk->ato > HZ/2 ||
			    tcp_raise_window(sk)) {
				tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
			}
			else
			{
				sk->ack_backlog++;

				if (sk->debug)
					printk("Ack queued.\n");

				tcp_reset_xmit_timer(sk, TIME_WRITE, sk->ato);
			}
		}
	}

	/*
	 *	If we've missed a packet, send an ack.
	 *	Also start a timer to send another.
	 */

	if (!skb->acked)
	{

		/*
		 *	This is important. If we don't have much room left,
		 *	we need to throw out a few packets so we have a good
		 *	window. Note that mtu is used, not mss, because mss is really
		 *	for the send side. He could be sending us stuff as large as mtu.
		 */

		while (sock_rspace(sk) < sk->mtu)
		{
			skb1 = skb_peek(&sk->receive_queue);
			if (skb1 == NULL)
			{
				printk("INET: tcp.c:tcp_data memory leak detected.\n");
				break;
			}

			/*
			 *	Don't throw out something that has been acked.
			 */

			if (skb1->acked)
			{
				break;
			}

			skb_unlink(skb1);
			kfree_skb(skb1, FREE_READ);
		}
		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
		sk->ack_backlog++;
		tcp_reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
	}

	/*
	 *	Now tell the user we may have some data.
	 */

	if (!sk->dead)
	{
		if (sk->debug)
			printk("Data wakeup.\n");
		sk->data_ready(sk, 0);
	}
	return(0);
}


/*
 *	This routine is only called when we have urgent data
 *	signalled. It's the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}
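
/*
 * Example (added, not in the original source): a segment with
 * seq 5000 and urg_ptr 4 arrives. The code treats urg_ptr as
 * pointing one past the urgent byte (the BSD interpretation), so
 * ptr becomes 4 - 1 + 5000 = 5003, the sequence number of the
 * urgent byte itself. If copied_seq is already past 5003 the event
 * is stale and dropped; otherwise urg_seq is recorded and tcp_urg()
 * below latches the byte once it arrives in a segment.
 */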

/*
 *	This is the 'fast' part of urgent handling.
 */

static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not
	 */

	if (th->urg)
		tcp_check_urg(sk, th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */

	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet?
		 */
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk, 0);
		}
	}
}


/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok = 0;

	/*
	 *	"redo" is 1 if we have already seen this skb but couldn't
	 *	use it at that time (the socket was locked).  In that case
	 *	we have already done a lot of the work (looked up the socket
	 *	etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type != PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw - skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
				break;
		}
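
		/*
		 * Added note: the missing breaks above are deliberate.
		 * CHECKSUM_NONE computes the sum in software and falls
		 * through to the CHECKSUM_HW verification; both fall
		 * through to CHECKSUM_UNNECESSARY, which needs no work.
		 */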
		sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/* We may need to add it to the backlog here. */
		if (sk->users)
		{
			skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state == TCP_CLOSE)
		goto no_tcp_socket;

	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket.
	 */

	skb->sk = sk;
	sk->rmem_alloc += skb->truesize;

	/*
	 *	We should now do header prediction.
	 */

	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if (sk->state != TCP_ESTABLISHED)	/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if (sk->state == TCP_LISTEN)
		{
			if (th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non-SYN segments are absorbed (old segments).
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Suns have
			 *	this problem so I'm ignoring it
			 */

			if (th->rst || !th->syn || th->ack || ip_chk_addr(daddr) != IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now that T/TCP is starting to see use we ought to queue this data.
			 */

			return 0;
		}

		/*
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */

		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */

		if (sk->state == TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if (th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if (!tcp_ack(sk, th, skb->ack_seq, len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if (th->rst)
					return tcp_reset(sk, skb);
				if (!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok = 1;	/* Don't reset this connection for the syn */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk, th);
				sk->dummy_th.dest = th->source;
				sk->copied_seq = sk->acked_seq;
				if (!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if (sk->max_window == 0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYNs cross. Drop if boring */
				if (th->syn && !th->rst)
				{
					/* Crossed SYNs are fine - but talking to
					   yourself is right out... */
					if (sk->saddr == saddr && sk->daddr == daddr &&
					    sk->dummy_th.source == th->source &&
					    sk->dummy_th.dest == th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk, skb);
					}
					tcp_set_state(sk, TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 */
			goto rfc_step6;
		}

		/*
		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
		 *	a more complex suggestion for fixing these reuse issues in RFC1644
		 *	but not yet ready for general use. Also see RFC1379.
		 *
		 *	Note the funny way we go back to the top of this function for
		 *	this case ("goto try_next_socket"). That also takes care of
		 *	checking "sk->users" for the new socket as well as doing all
		 *	the normal tests on the packet.
		 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if (sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;
			sk->rmem_alloc -= skb->truesize;
			skb->sk = NULL;
			sk->err = ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state == TCP_LISTEN)
			{
				skb->sk = sk;
				sk->rmem_alloc += skb->truesize;
				tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq - th->syn))
	{
		bad_tcp_sequence(sk, th, len, opt, saddr, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if (th->rst)
		return tcp_reset(sk, skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if (th->syn && !syn_ok)
	{
		tcp_send_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_reset(sk, skb);
	}

	tcp_delack_estimator(sk);

	/*
	 *	Process the ACK
	 */

	if (th->ack && !tcp_ack(sk, th, skb->ack_seq, len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if (sk->state == TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos, sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	If the accepted buffer puts us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	if (sk->rmem_alloc >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		return(0);
	}


	/*
	 *	Process urgent data
	 */

	tcp_urg(sk, th, len);

	/*
	 *	Process the encapsulated data
	 */

	if (tcp_data(skb, sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	And done
	 */

	return 0;

no_tcp_socket:
	/*
	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}