1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp_input.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * FIXES
23 * Pedro Roque : Double ACK bug
24 */
25
26 #include <linux/config.h>
27 #include <net/tcp.h>
28
29 /*
30 * Policy code extracted so it's now separate
31 */
32
33 /*
34 * Called each time to estimate the delayed ack timeout. This is
35 * how it should be done so a fast link isn't impacted by ack delay.
36 */
37
38 extern __inline__ void tcp_delack_estimator(struct sock *sk)
39 {
40 /*
41 * Delayed ACK time estimator.
42 */
43
44 if (sk->lrcvtime == 0)
45 {
46 sk->lrcvtime = jiffies;
47 sk->ato = HZ/3;
48 }
49 else
50 {
51 int m;
52
53 m = jiffies - sk->lrcvtime;
54
55 sk->lrcvtime = jiffies;
56
57 if (m <= 0)
58 m = 1;
59
60 if (m > (sk->rtt >> 3))
61 {
62 sk->ato = sk->rtt >> 3;
63 /*
64 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
65 */
66 }
67 else
68 {
69 sk->ato = (sk->ato >> 1) + m;
70 /*
71 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
72 */
73 }
74 }
75 }
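
/*
 * Editor's sketch: the update above written with unscaled values
 * (sk->rtt holds the smoothed RTT scaled by 8, so sk->rtt >> 3 is the
 * plain estimate).  The effect is that ato decays toward the recent
 * packet inter-arrival gap but never exceeds the smoothed RTT, so a
 * fast link ends up with a short delayed-ACK timeout.  The name
 * delack_sketch is illustrative only, not kernel API.
 */
static unsigned long delack_sketch(unsigned long ato, long gap, unsigned long srtt)
{
	if (gap <= 0)
		gap = 1;
	if (gap > (long) srtt)
		return srtt;			/* never delay longer than the smoothed RTT */
	return (ato >> 1) + gap;		/* otherwise blend the new gap into ato */
}
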
76
77 /*
78 * Called on frames that were known _not_ to have been
79 * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87].
80 * The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
81 */
82
83 extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
84 {
85 long m;
86 /*
87 * The following amusing code comes from Jacobson's
88 * article in SIGCOMM '88. Note that rtt and mdev
89 * are scaled versions of rtt and mean deviation.
90 * This is designed to be as fast as possible
91 * m stands for "measurement".
92 */
93
94 m = jiffies - oskb->when; /* RTT */
95 if (sk->rtt != 0) {
96 if(m<=0)
97 m=1; /* IS THIS RIGHT FOR <0 ??? */
98 m -= (sk->rtt >> 3); /* m is now error in rtt est */
99 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
100 if (m < 0)
101 m = -m; /* m is now abs(error) */
102 m -= (sk->mdev >> 2); /* similar update on mdev */
103 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
104 } else {
105 /* no previous measure. */
106 sk->rtt = m<<3; /* take the measured time to be rtt */
107 sk->mdev = m<<2; /* make sure rto = 3*rtt */
108 }
109
110 /*
111 * Now update timeout. Note that this removes any backoff.
112 */
113
114 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
115 if (sk->rto > 120*HZ)
116 sk->rto = 120*HZ;
117 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
118 sk->rto = HZ/5;
119 sk->backoff = 0;
120 }
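
/*
 * Editor's sketch: the shift arithmetic above with the scaling taken
 * out.  sk->rtt holds 8*srtt and sk->mdev holds 4*mdev, so (modulo
 * rounding of the scaled integer arithmetic) the updates are the
 * exponential averages below, and the timeout comes out as
 * srtt + 2*mdev, clamped to [HZ/5, 120*HZ].  Names are illustrative
 * only, not kernel API.
 */
struct rtt_sketch {
	long srtt;	/* smoothed round trip time (jiffies) */
	long mdev;	/* smoothed mean deviation (jiffies) */
	long rto;	/* retransmission timeout (jiffies) */
};

static void rtt_sketch_update(struct rtt_sketch *r, long m, long hz)
{
	if (m <= 0)
		m = 1;
	if (r->srtt == 0) {
		r->srtt = m;			/* first measurement seeds both */
		r->mdev = m;			/* gives rto = 3*m initially */
	} else {
		long err = m - r->srtt;
		r->srtt += err / 8;		/* srtt = 7/8 srtt + 1/8 m */
		if (err < 0)
			err = -err;
		r->mdev += (err - r->mdev) / 4;	/* mdev = 3/4 mdev + 1/4 |err| */
	}
	r->rto = r->srtt + 2 * r->mdev;		/* note: this code uses 2*mdev, not 4 */
	if (r->rto > 120 * hz)
		r->rto = 120 * hz;
	if (r->rto < hz / 5)
		r->rto = hz / 5;		/* .2s floor for BSD delayed acks */
}
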
121
122 /*
123 * Cached last hit socket
124 */
125
126 static volatile unsigned long th_cache_saddr, th_cache_daddr;
127 static volatile unsigned short th_cache_dport, th_cache_sport;
128 static volatile struct sock *th_cache_sk;
129
130 void tcp_cache_zap(void)
131 {
132 th_cache_sk=NULL;
133 }
134
135 /*
136 * Find the socket, using the last hit cache if applicable. The cache is not quite
137 * right...
138 */
139
140 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
141 {
142 struct sock * sk;
143
144 sk = (struct sock *) th_cache_sk;
145 if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
146 sport != th_cache_sport || dport != th_cache_dport) {
147 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
148 if (sk) {
149 th_cache_saddr=saddr;
150 th_cache_daddr=daddr;
151 th_cache_dport=dport;
152 th_cache_sport=sport;
153 th_cache_sk=sk;
154 }
155 }
156 return sk;
157 }
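
/*
 * Editor's sketch of the one-entry demux cache pattern used above:
 * remember the last (addresses/ports -> socket) pair and fall back to
 * the full lookup on a miss.  The names and the callback are
 * illustrative only; the real code uses the globals and get_sock()
 * directly.
 */
struct demux_cache {
	u32 saddr, daddr;
	u16 sport, dport;
	struct sock *sk;
};

static struct sock *cached_lookup(struct demux_cache *c,
	u32 saddr, u16 sport, u32 daddr, u16 dport,
	struct sock *(*slow_lookup)(u32, u16, u32, u16))
{
	if (c->sk && c->saddr == saddr && c->daddr == daddr &&
	    c->sport == sport && c->dport == dport)
		return c->sk;					/* hit: reuse last result */
	c->sk = slow_lookup(saddr, sport, daddr, dport);	/* miss: full lookup */
	if (c->sk) {
		c->saddr = saddr; c->daddr = daddr;
		c->sport = sport; c->dport = dport;
	}
	return c->sk;
}
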
158
159 /*
160 * React to an out-of-window TCP sequence number in an incoming packet
161 */
162
163 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
164 struct device *dev)
165 {
166 if (th->rst)
167 return;
168
169 /*
170 * Send a reset if we get something not ours and we are
171 * unsynchronized. Note: We don't do anything to our end. We
172 * are just killing the bogus remote connection then we will
173 * connect again and it will work (with luck).
174 */
175
176 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
177 {
178 tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
179 return;
180 }
181
182 /*
183 * 4.3reno machines look for this kind of ack so they can do fast
184 * recovery. Three identical 'old' acks let it know that one frame has
185 * been lost and should be resent. Because this happens before the whole window
186 * of data has timed out, it can take one lost frame per window without
187 * stalling. [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
188 */
189 tcp_send_ack(sk);
190 }
191
192 /*
193 * This function checks whether the tcp header is actually acceptable.
194 */
195
196 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
197 {
198 u32 end_window = sk->acked_seq + sk->window;
199 return /* if start is at end of window, end must be too (zero window) */
200 (seq == end_window && seq == end_seq) ||
201 /* if start is before end of window, check for interest */
202 (before(seq, end_window) && !before(end_seq, sk->acked_seq));
203 }
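
/*
 * Editor's sketch: the test above leans on the before()/after()
 * helpers, which compare 32-bit sequence numbers modulo 2^32 by
 * looking at the sign of their difference, so the window check keeps
 * working across sequence wrap-around.  A standalone version of both
 * pieces (illustrative names, assuming the usual signed-difference
 * trick):
 */
static int seq_before(u32 a, u32 b)		/* a < b in sequence space */
{
	return (s32) (a - b) < 0;
}

static int seq_acceptable(u32 seq, u32 end_seq, u32 acked_seq, u32 window)
{
	u32 end_window = acked_seq + window;
	/* zero window: only a bare segment sitting exactly on the edge fits */
	if (seq == end_window && seq == end_seq)
		return 1;
	/* otherwise it must start before the right edge and end at or
	   beyond what we have already acked */
	return seq_before(seq, end_window) && !seq_before(end_seq, acked_seq);
}
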
204
205 /*
206 * When we get a reset we do this. This probably is a tcp_output routine
207 * really.
208 */
209
210 static int tcp_reset(struct sock *sk, struct sk_buff *skb)
211 {
212 sk->zapped = 1;
213 /*
214 * We want the right error as BSD sees it (and indeed as we do).
215 */
216 sk->err = ECONNRESET;
217 if (sk->state == TCP_SYN_SENT)
218 sk->err = ECONNREFUSED;
219 if (sk->state == TCP_CLOSE_WAIT)
220 sk->err = EPIPE;
221 #ifdef CONFIG_TCP_RFC1337
222 /*
223 * Time wait assassination protection [RFC1337]
224 *
225 * This is a good idea, but causes more sockets to take time to close.
226 *
227 * Ian Heavens has since shown this is an inadequate fix for the protocol
228 * bug in question.
229 */
230 if(sk->state!=TCP_TIME_WAIT)
231 {
232 tcp_set_state(sk,TCP_CLOSE);
233 sk->shutdown = SHUTDOWN_MASK;
234 }
235 #else
236 tcp_set_state(sk,TCP_CLOSE);
237 sk->shutdown = SHUTDOWN_MASK;
238 #endif
239 if (!sk->dead)
240 sk->state_change(sk);
241 kfree_skb(skb, FREE_READ);
242 return(0);
243 }
244
245
246 /*
247 * Look for tcp options. Parses everything but only knows about MSS.
248 * This routine is always called with the packet containing the SYN.
249 * However it may also be called with the ack to the SYN. So you
250 * can't assume this is always the SYN. It's always called after
251 * we have set up sk->mtu to our own MTU.
252 *
253 * We need at minimum to add PAWS support here. Possibly large windows
254 * as Linux gets deployed on 100Mb/sec networks.
255 */
256
257 static void tcp_options(struct sock *sk, struct tcphdr *th)
258 {
259 unsigned char *ptr;
260 int length=(th->doff*4)-sizeof(struct tcphdr);
261 int mss_seen = 0;
262
263 ptr = (unsigned char *)(th + 1);
264
265 while(length>0)
266 {
267 int opcode=*ptr++;
268 int opsize=*ptr++;
269 switch(opcode)
270 {
271 case TCPOPT_EOL:
272 return;
273 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
274 length--;
275 ptr--; /* the opsize=*ptr++ above was a mistake */
276 continue;
277
278 default:
279 if(opsize<=2) /* Avoid silly options looping forever */
280 return;
281 switch(opcode)
282 {
283 case TCPOPT_MSS:
284 if(opsize==4 && th->syn)
285 {
286 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
287 mss_seen = 1;
288 }
289 break;
290 /* Add other options here as people feel the urge to implement stuff like large windows */
291 }
292 ptr+=opsize-2;
293 length-=opsize;
294 }
295 }
296 if (th->syn)
297 {
298 if (! mss_seen)
299 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
300 }
301 #ifdef CONFIG_INET_PCTCP
302 sk->mss = min(sk->max_window >> 1, sk->mtu);
303 #else
304 sk->mss = min(sk->max_window, sk->mtu);
305 sk->max_unacked = 2 * sk->mss;
306 #endif
307 }
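
/*
 * Editor's sketch: on the wire an MSS option is kind=2, length=4,
 * followed by a 16-bit value in network byte order (the bytes
 * 02 04 05 b4 advertise an MSS of 1460).  A minimal standalone walk
 * over a raw option block, same structure as the loop above but with
 * the bounds checks made explicit.  Illustrative names only.
 */
static int find_mss(const unsigned char *opt, int len, unsigned short *mss)
{
	while (len > 0) {
		int kind = opt[0];
		int size;
		if (kind == 0)				/* TCPOPT_EOL: end of options */
			return 0;
		if (kind == 1) {			/* TCPOPT_NOP: one byte of padding */
			opt++;
			len--;
			continue;
		}
		if (len < 2 || (size = opt[1]) < 2 || size > len)
			return 0;			/* malformed option, stop parsing */
		if (kind == 2 && size == 4) {		/* TCPOPT_MSS */
			*mss = (opt[2] << 8) | opt[3];	/* network byte order */
			return 1;
		}
		opt += size;
		len -= size;
	}
	return 0;
}
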
308
309
310 /*
311 * This routine handles a connection request.
312 * It should make sure we haven't already responded.
313 * Because of the way BSD works, we have to send a syn/ack now.
314 * This also means it will be harder to close a socket which is
315 * listening.
316 */
317
318 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
319 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
320 {
321 struct sock *newsk;
322 struct tcphdr *th;
323 struct rtable *rt;
324
325 th = skb->h.th;
326
327 /* If the socket is dead, don't accept the connection. */
328 if (!sk->dead)
329 {
330 sk->data_ready(sk,0);
331 }
332 else
333 {
334 if(sk->debug)
335 printk("Reset on %p: Connect on dead socket.\n",sk);
336 tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
337 tcp_statistics.TcpAttemptFails++;
338 kfree_skb(skb, FREE_READ);
339 return;
340 }
341
342 /*
343 * Make sure we can accept more. This will prevent a
344 * flurry of syns from eating up all our memory.
345 *
346 * BSD does some funnies here and allows 3/2 times the
347 * set backlog as a fudge factor. That's just too gross.
348 */
349
350 if (sk->ack_backlog >= sk->max_ack_backlog)
351 {
352 tcp_statistics.TcpAttemptFails++;
353 kfree_skb(skb, FREE_READ);
354 return;
355 }
356
357 /*
358 * We need to build a new sock struct.
359 * It is sort of bad to have a socket without an inode attached
360 * to it, but the wake_up's will just wake up the listening socket,
361 * and if the listening socket is destroyed before this is taken
362 * off of the queue, this will take care of it.
363 */
364
365 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
366 if (newsk == NULL)
367 {
368 /* just ignore the syn. It will get retransmitted. */
369 tcp_statistics.TcpAttemptFails++;
370 kfree_skb(skb, FREE_READ);
371 return;
372 }
373
374 memcpy(newsk, sk, sizeof(*newsk));
375 newsk->opt = NULL;
376 newsk->ip_route_cache = NULL;
377 if (opt && opt->optlen)
378 {
379 sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
380 if (!sk->opt)
381 {
382 kfree_s(newsk, sizeof(struct sock));
383 tcp_statistics.TcpAttemptFails++;
384 kfree_skb(skb, FREE_READ);
385 return;
386 }
387 if (ip_options_echo(sk->opt, opt, daddr, saddr, skb))
388 {
389 kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
390 kfree_s(newsk, sizeof(struct sock));
391 tcp_statistics.TcpAttemptFails++;
392 kfree_skb(skb, FREE_READ);
393 return;
394 }
395 }
396 skb_queue_head_init(&newsk->write_queue);
397 skb_queue_head_init(&newsk->receive_queue);
398 newsk->send_head = NULL;
399 newsk->send_tail = NULL;
400 skb_queue_head_init(&newsk->back_log);
401 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
402 newsk->rto = TCP_TIMEOUT_INIT;
403 newsk->mdev = TCP_TIMEOUT_INIT<<1;
404 newsk->max_window = 0;
405 newsk->cong_window = 1;
406 newsk->cong_count = 0;
407 newsk->ssthresh = 0;
408 newsk->backoff = 0;
409 newsk->blog = 0;
410 newsk->intr = 0;
411 newsk->proc = 0;
412 newsk->done = 0;
413 newsk->partial = NULL;
414 newsk->pair = NULL;
415 newsk->wmem_alloc = 0;
416 newsk->rmem_alloc = 0;
417 newsk->localroute = sk->localroute;
418
419 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
420
421 newsk->err = 0;
422 newsk->shutdown = 0;
423 newsk->ack_backlog = 0;
424 newsk->acked_seq = skb->seq+1;
425 newsk->lastwin_seq = skb->seq+1;
426 newsk->delay_acks = 1;
427 newsk->copied_seq = skb->seq+1;
428 newsk->fin_seq = skb->seq;
429 newsk->state = TCP_SYN_RECV;
430 newsk->timeout = 0;
431 newsk->ip_xmit_timeout = 0;
432 newsk->write_seq = seq;
433 newsk->window_seq = newsk->write_seq;
434 newsk->rcv_ack_seq = newsk->write_seq;
435 newsk->urg_data = 0;
436 newsk->retransmits = 0;
437 newsk->linger=0;
438 newsk->destroy = 0;
439 init_timer(&newsk->timer);
440 newsk->timer.data = (unsigned long)newsk;
441 newsk->timer.function = &net_timer;
442 init_timer(&newsk->delack_timer);
443 newsk->delack_timer.data = (unsigned long)newsk;
444 newsk->delack_timer.function = tcp_delack_timer;
445 init_timer(&newsk->retransmit_timer);
446 newsk->retransmit_timer.data = (unsigned long)newsk;
447 newsk->retransmit_timer.function = tcp_retransmit_timer;
448 newsk->dummy_th.source = skb->h.th->dest;
449 newsk->dummy_th.dest = skb->h.th->source;
450
451 /*
452 * Swap these two, they are from our point of view.
453 */
454
455 newsk->daddr = saddr;
456 newsk->saddr = daddr;
457 newsk->rcv_saddr = daddr;
458
459 put_sock(newsk->num,newsk);
460 newsk->acked_seq = skb->seq + 1;
461 newsk->copied_seq = skb->seq + 1;
462 newsk->socket = NULL;
463
464 /*
465 * Grab the ttl and tos values and use them
466 */
467
468 newsk->ip_ttl=sk->ip_ttl;
469 newsk->ip_tos=skb->ip_hdr->tos;
470
471 /*
472 * Use 512 or whatever user asked for
473 */
474
475 /*
476 * Note use of sk->user_mss, since user has no direct access to newsk
477 */
478
479 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
480 newsk->ip_route_cache = rt;
481
482 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
483 newsk->window_clamp = rt->rt_window;
484 else
485 newsk->window_clamp = 0;
486
487 if (sk->user_mss)
488 newsk->mtu = sk->user_mss;
489 else if (rt)
490 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
491 else
492 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
493
494 /*
495 * But not bigger than device MTU
496 */
497
498 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
499
500 #ifdef CONFIG_SKIP
501
502 /*
503 * SKIP devices set their MTU to 65535. This is so they can take packets
504 * unfragmented to the security process and then fragment. They could lie to the
505 * TCP layer about a suitable MTU, but it's easier to let skip sort it out
506 * simply because the final packet we want unfragmented is going to be
507 *
508 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
509 */
510
511 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
512 sk->mtu=skip_pick_mtu(sk->mtu,dev);
513 #endif
514 /*
515 * This will min with what arrived in the packet
516 */
517
518 tcp_options(newsk,skb->h.th);
519
520 tcp_cache_zap();
521 tcp_send_synack(newsk, sk, skb);
522 }
523
524
525 /*
526 * Handle a TCP window that shrunk on us. It shouldn't happen,
527 * but..
528 *
529 * We may need to move packets from the send queue
530 * to the write queue, if the window has been shrunk on us.
531 * The RFC says you are not allowed to shrink your window
532 * like this, but if the other end does, you must be able
533 * to deal with it.
534 */
535 void tcp_window_shrunk(struct sock * sk, u32 window_seq)
536 {
537 struct sk_buff *skb;
538 struct sk_buff *skb2;
539 struct sk_buff *wskb = NULL;
540
541 skb2 = sk->send_head;
542 sk->send_head = NULL;
543 sk->send_tail = NULL;
544
545 /*
546 * This is an artifact of a flawed concept. We want one
547 * queue and a smarter send routine when we send all.
548 */
549 cli();
550 while (skb2 != NULL)
551 {
552 skb = skb2;
553 skb2 = skb->link3;
554 skb->link3 = NULL;
555 if (after(skb->end_seq, window_seq))
556 {
557 if (sk->packets_out > 0)
558 sk->packets_out--;
559 /* We may need to remove this from the dev send list. */
560 if (skb->next != NULL)
561 {
562 skb_unlink(skb);
563 }
564 /* Now add it to the write_queue. */
565 if (wskb == NULL)
566 skb_queue_head(&sk->write_queue,skb);
567 else
568 skb_append(wskb,skb);
569 wskb = skb;
570 }
571 else
572 {
573 if (sk->send_head == NULL)
574 {
575 sk->send_head = skb;
576 sk->send_tail = skb;
577 }
578 else
579 {
580 sk->send_tail->link3 = skb;
581 sk->send_tail = skb;
582 }
583 skb->link3 = NULL;
584 }
585 }
586 sti();
587 }
588
589
590 /*
591 * This routine deals with incoming acks, but not outgoing ones.
592 *
593 * This routine is totally _WRONG_. The list structuring is wrong,
594 * the algorithm is wrong, the code is wrong.
595 */
596
597 static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
598 {
599 int flag = 0;
600 u32 window_seq;
601
602 /*
603 * 1 - there was data in packet as well as ack or new data is sent or
604 * in shutdown state
605 * 2 - data from retransmit queue was acked and removed
606 * 4 - window shrunk or data from retransmit queue was acked and removed
607 */
608
609 if(sk->zapped)
610 return(1); /* Dead, can't ack any more so why bother */
611
612 /*
613 * We have dropped back to keepalive timeouts. Thus we have
614 * no retransmits pending.
615 */
616
617 if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
618 sk->retransmits = 0;
619
620 /*
621 * If the ack is newer than sent or older than previous acks
622 * then we can probably ignore it.
623 */
624
625 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
626 goto uninteresting_ack;
627
628 /*
629 * If there is data set flag 1
630 */
631
632 if (len != th->doff*4)
633 flag |= 1;
634
635 /*
636 * Have we discovered a larger window
637 */
638 window_seq = ntohs(th->window);
639 if (window_seq > sk->max_window)
640 {
641 sk->max_window = window_seq;
642 #ifdef CONFIG_INET_PCTCP
643 /* Hack because we don't send partial packets to non SWS
644 handling hosts */
645 sk->mss = min(window_seq>>1, sk->mtu);
646 #else
647 sk->mss = min(window_seq, sk->mtu);
648 #endif
649 }
650 window_seq += ack;
651
652 /*
653 * See if our window has been shrunk.
654 */
655 if (after(sk->window_seq, window_seq)) {
656 flag |= 4;
657 tcp_window_shrunk(sk, window_seq);
658 }
659
660 /*
661 * Pipe has emptied
662 */
663 if (sk->send_tail == NULL || sk->send_head == NULL)
664 {
665 sk->send_head = NULL;
666 sk->send_tail = NULL;
667 sk->packets_out= 0;
668 }
669
670 /*
671 * We don't want too many packets out there.
672 */
673
674 if (sk->ip_xmit_timeout == TIME_WRITE &&
675 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
676 {
677
678 /*
679 * This is Jacobson's slow start and congestion avoidance.
680 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
681 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
682 * counter and increment it once every cwnd times. It's possible
683 * that this should be done only if sk->retransmits == 0. I'm
684 * interpreting "new data is acked" as including data that has
685 * been retransmitted but is just now being acked.
686 */
687 if (sk->cong_window < sk->ssthresh)
688 /*
689 * In "safe" area, increase
690 */
691 sk->cong_window++;
692 else
693 {
694 /*
695 * In dangerous area, increase slowly. In theory this is
696 * sk->cong_window += 1 / sk->cong_window
697 */
698 if (sk->cong_count >= sk->cong_window)
699 {
700 sk->cong_window++;
701 sk->cong_count = 0;
702 }
703 else
704 sk->cong_count++;
705 }
706 }
707
708 /*
709 * Remember the highest ack received and update the
710 * right hand window edge of the host.
711 * We do a bit of work here to track number of times we've
712 * seen this ack without a change in the right edge of the
713 * window and no data in the packet.
714 * This will allow us to do fast retransmits.
715 */
716
717 /* We are looking for duplicate ACKs here.
718 * An ACK is a duplicate if:
719 * (1) it has the same sequence number as the largest number we've seen,
720 * (2) it has the same window as the last ACK,
721 * (3) we have outstanding data that has not been ACKed
722 * (4) The packet was not carrying any data.
723 * I've tried to order these from most likely to fail
724 * to least likely to fail.
725 * [These are the rules BSD stacks use to determine if an ACK is a
726 * duplicate.]
727 */
728
729 if (sk->rcv_ack_seq == ack
730 && sk->window_seq == window_seq
731 && !(flag&1)
732 && before(ack, sk->sent_seq))
733 {
734 /* See draft-stevens-tcpca-spec-01 for explanation
735 * of what we are doing here.
736 */
737 sk->rcv_ack_cnt++;
738 if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
739 sk->ssthresh = max(sk->cong_window >> 1, 2);
740 sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
741 tcp_do_retransmit(sk,0);
742 /* reduce the count. We don't want to be
743 * seen to be in "retransmit" mode if we
744 * are doing a fast retransmit.
745 */
746 sk->retransmits--;
747 } else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
748 sk->cong_window++;
749 /*
750 * At this point we are supposed to transmit a NEW
751 * packet (not retransmit the missing packet,
752 * this would only get us into a retransmit war.)
753 * I think that having just adjusted cong_window
754 * we will transmit the new packet below.
755 */
756 }
757 }
758 else
759 {
760 if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
761 sk->cong_window = sk->ssthresh;
762 }
763 sk->window_seq = window_seq;
764 sk->rcv_ack_seq = ack;
765 sk->rcv_ack_cnt = 1;
766 }
767
768 /*
769 * We passed data and got it acked, remove any soft error
770 * log. Something worked...
771 */
772
773 sk->err_soft = 0;
774
775 /*
776 * If this ack opens up a zero window, clear backoff. It was
777 * being used to time the probes, and is probably far higher than
778 * it needs to be for normal retransmission.
779 */
780
781 if (sk->ip_xmit_timeout == TIME_PROBE0)
782 {
783 sk->retransmits = 0; /* Our probe was answered */
784
785 /*
786 * Was it a usable window open ?
787 */
788
789 if (!skb_queue_empty(&sk->write_queue) && /* should always be true */
790 ! before (sk->window_seq, sk->write_queue.next->end_seq))
791 {
792 sk->backoff = 0;
793
794 /*
795 * Recompute rto from rtt. this eliminates any backoff.
796 */
797
798 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
799 if (sk->rto > 120*HZ)
800 sk->rto = 120*HZ;
801 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
802 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
803 .2 of a second is going to need huge windows (SIGH) */
804 sk->rto = HZ/5;
805 }
806 }
807
808 /*
809 * See if we can take anything off of the retransmit queue.
810 */
811
812 for (;;) {
813 struct sk_buff * skb = sk->send_head;
814 if (!skb)
815 break;
816
817 /* Check for a bug. */
818 if (skb->link3 && after(skb->end_seq, skb->link3->end_seq))
819 printk("INET: tcp.c: *** bug send_list out of order.\n");
820
821 /*
822 * If our packet is before the ack sequence we can
823 * discard it as it's confirmed to have arrived at the other end.
824 */
825
826 if (after(skb->end_seq, ack))
827 break;
828
829 if (sk->retransmits)
830 {
831 /*
832 * We were retransmitting. don't count this in RTT est
833 */
834 flag |= 2;
835 }
836
837 if ((sk->send_head = skb->link3) == NULL)
838 {
839 sk->send_tail = NULL;
840 sk->retransmits = 0;
841 }
842 /*
843 * Note that we only reset backoff and rto in the
844 * rtt recomputation code. And that doesn't happen
845 * if there were retransmissions in effect. So the
846 * first new packet after the retransmissions is
847 * sent with the backoff still in effect. Not until
848 * we get an ack from a non-retransmitted packet do
849 * we reset the backoff and rto. This allows us to deal
850 * with a situation where the network delay has increased
851 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
852 */
853
854 /*
855 * We have one less packet out there.
856 */
857
858 if (sk->packets_out > 0)
859 sk->packets_out --;
860
861 if (!(flag&2)) /* Not retransmitting */
862 tcp_rtt_estimator(sk,skb);
863 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
864 in this case', as we just set it up */
865 IS_SKB(skb);
866
867 /*
868 * We may need to remove this from the dev send list.
869 */
870 cli();
871 if (skb->next)
872 skb_unlink(skb);
873 sti();
874 kfree_skb(skb, FREE_WRITE); /* write. */
875 if (!sk->dead)
876 sk->write_space(sk);
877 }
878
879 /*
880 * XXX someone ought to look at this too.. at the moment, if skb_peek()
881 * returns non-NULL, we completely ignore the timer stuff in the else
882 * clause. We ought to organize the code so that else clause can
883 * (should) be executed regardless, possibly moving the PROBE timer
884 * reset over. The skb_peek() thing should only move stuff to the
885 * write queue, NOT also manage the timer functions.
886 */
887
888 /*
889 * Maybe we can take some stuff off of the write queue,
890 * and put it onto the xmit queue.
891 */
892 if (skb_peek(&sk->write_queue) != NULL)
893 {
894 if (!before(sk->window_seq, sk->write_queue.next->end_seq) &&
895 (sk->retransmits == 0 ||
896 sk->ip_xmit_timeout != TIME_WRITE ||
897 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq))
898 && sk->packets_out < sk->cong_window)
899 {
900 /*
901 * Add more data to the send queue.
902 */
903 flag |= 1;
904 tcp_write_xmit(sk);
905 }
906 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
907 sk->send_head == NULL &&
908 sk->ack_backlog == 0 &&
909 sk->state != TCP_TIME_WAIT)
910 {
911 /*
912 * Data to queue but no room.
913 */
914 tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
915 }
916 }
917 else
918 {
919 /*
920 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
921 * from TCP_CLOSE we don't do anything
922 *
923 * from anything else, if there is write data (or fin) pending,
924 * we use a TIME_WRITE timeout, else if keepalive we reset to
925 * a KEEPALIVE timeout, else we delete the timer.
926 *
927 * We do not set flag for nominal write data, otherwise we may
928 * force a state where we start to write itsy bitsy tidbits
929 * of data.
930 */
931
932 switch(sk->state) {
933 case TCP_TIME_WAIT:
934 /*
935 * keep us in TIME_WAIT until we stop getting packets,
936 * reset the timeout.
937 */
938 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
939 break;
940 case TCP_CLOSE:
941 /*
942 * don't touch the timer.
943 */
944 break;
945 default:
946 /*
947 * Must check send_head and write_queue
948 * to determine which timeout to use.
949 */
950 if (sk->send_head || !skb_queue_empty(&sk->write_queue)) {
951 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
952 } else if (sk->keepopen) {
953 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
954 } else {
955 del_timer(&sk->retransmit_timer);
956 sk->ip_xmit_timeout = 0;
957 }
958 break;
959 }
960 }
961
962 /*
963 * We have nothing queued but space to send. Send any partial
964 * packets immediately (end of Nagle rule application).
965 */
966
967 if (sk->packets_out == 0
968 && sk->partial != NULL
969 && skb_queue_empty(&sk->write_queue)
970 && sk->send_head == NULL)
971 {
972 flag |= 1;
973 tcp_send_partial(sk);
974 }
975
976 /*
977 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
978 * we are now waiting for an acknowledge to our FIN. The other end is
979 * already in TIME_WAIT.
980 *
981 * Move to TCP_CLOSE on success.
982 */
983
984 if (sk->state == TCP_LAST_ACK)
985 {
986 if (!sk->dead)
987 sk->state_change(sk);
988 if(sk->debug)
989 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
990 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
991 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
992 {
993 flag |= 1;
994 sk->shutdown = SHUTDOWN_MASK;
995 tcp_set_state(sk,TCP_CLOSE);
996 return 1;
997 }
998 }
999
1000 /*
1001 * Incoming ACK to a FIN we sent in the case of our initiating the close.
1002 *
1003 * Move to FIN_WAIT2 to await a FIN from the other end. Set
1004 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
1005 */
1006
1007 if (sk->state == TCP_FIN_WAIT1)
1008 {
1009
1010 if (!sk->dead)
1011 sk->state_change(sk);
1012 if (sk->rcv_ack_seq == sk->write_seq)
1013 {
1014 flag |= 1;
1015 sk->shutdown |= SEND_SHUTDOWN;
1016 tcp_set_state(sk, TCP_FIN_WAIT2);
1017 }
1018 }
1019
1020 /*
1021 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
1022 *
1023 * Move to TIME_WAIT
1024 */
1025
1026 if (sk->state == TCP_CLOSING)
1027 {
1028
1029 if (!sk->dead)
1030 sk->state_change(sk);
1031 if (sk->rcv_ack_seq == sk->write_seq)
1032 {
1033 flag |= 1;
1034 tcp_time_wait(sk);
1035 }
1036 }
1037
1038 /*
1039 * Final ack of a three way shake
1040 */
1041
1042 if(sk->state==TCP_SYN_RECV)
1043 {
1044 tcp_set_state(sk, TCP_ESTABLISHED);
1045 tcp_options(sk,th);
1046 sk->dummy_th.dest=th->source;
1047 sk->copied_seq = sk->acked_seq;
1048 if(!sk->dead)
1049 sk->state_change(sk);
1050 if(sk->max_window==0)
1051 {
1052 sk->max_window=32; /* Sanity check */
1053 sk->mss=min(sk->max_window,sk->mtu);
1054 }
1055 }
1056
1057 /*
1058 * I make no guarantees about the first clause in the following
1059 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
1060 * what conditions "!flag" would be true. However I think the rest
1061 * of the conditions would prevent that from causing any
1062 * unnecessary retransmission.
1063 * Clearly if the first packet has expired it should be
1064 * retransmitted. The other alternative, "flag&2 && retransmits", is
1065 * harder to explain: You have to look carefully at how and when the
1066 * timer is set and with what timeout. The most recent transmission always
1067 * sets the timer. So in general if the most recent thing has timed
1068 * out, everything before it has as well. So we want to go ahead and
1069 * retransmit some more. If we didn't explicitly test for this
1070 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
1071 * would not be true. If you look at the pattern of timing, you can
1072 * show that rto is increased fast enough that the next packet would
1073 * almost never be retransmitted immediately. Then you'd end up
1074 * waiting for a timeout to send each packet on the retransmission
1075 * queue. With my implementation of the Karn sampling algorithm,
1076 * the timeout would double each time. The net result is that it would
1077 * take a hideous amount of time to recover from a single dropped packet.
1078 * It's possible that there should also be a test for TIME_WRITE, but
1079 * I think as long as "send_head != NULL" and "retransmit" is on, we've
1080 * got to be in real retransmission mode.
1081 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
1082 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
1083 * As long as no further losses occur, this seems reasonable.
1084 */
1085
1086 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
1087 (((flag&2) && sk->retransmits) ||
1088 (sk->send_head->when + sk->rto < jiffies)))
1089 {
1090 if(sk->send_head->when + sk->rto < jiffies)
1091 tcp_retransmit(sk,0);
1092 else
1093 {
1094 tcp_do_retransmit(sk, 1);
1095 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1096 }
1097 }
1098
1099 return 1;
1100
1101 uninteresting_ack:
1102 if(sk->debug)
1103 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
1104
1105 /*
1106 * Keepalive processing.
1107 */
1108
1109 if (after(ack, sk->sent_seq))
1110 {
1111 return 0;
1112 }
1113
1114 /*
1115 * Restart the keepalive timer.
1116 */
1117
1118 if (sk->keepopen)
1119 {
1120 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
1121 tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1122 }
1123 return 1;
1124 }
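
/*
 * Editor's sketch of the congestion window handling implemented in
 * tcp_ack() above: the window is counted in whole segments, grows by
 * one per ack in slow start and by roughly 1/cwnd per ack in
 * congestion avoidance, and a run of identical acks triggers a
 * Reno-style fast retransmit.  Standalone and with illustrative
 * names; the actual retransmission is left as a callback.
 */
struct cwnd_sketch {
	unsigned long cwnd;		/* congestion window, in segments */
	unsigned long ssthresh;		/* slow start threshold */
	unsigned long cong_count;	/* acks seen since the last cwnd++ */
	int dup_acks;			/* identical acks seen in a row */
};

#define SKETCH_DUP_ACKS 3		/* stands in for MAX_DUP_ACKS */

/* An ack that advances the left window edge: grow the window. */
static void sketch_ack_new_data(struct cwnd_sketch *c)
{
	c->dup_acks = 0;
	if (c->cwnd < c->ssthresh)
		c->cwnd++;			/* slow start: one segment per ack */
	else if (c->cong_count >= c->cwnd) {
		c->cwnd++;			/* congestion avoidance: ~1/cwnd per ack */
		c->cong_count = 0;
	} else
		c->cong_count++;
}

/* A duplicate ack: same ack, same window, no data, data outstanding. */
static void sketch_dup_ack(struct cwnd_sketch *c, void (*retransmit_one)(void))
{
	c->dup_acks++;
	if (c->dup_acks == SKETCH_DUP_ACKS) {
		c->ssthresh = c->cwnd >> 1;
		if (c->ssthresh < 2)
			c->ssthresh = 2;
		c->cwnd = c->ssthresh + SKETCH_DUP_ACKS + 1;	/* deflate but keep the pipe primed */
		retransmit_one();				/* fast retransmit the missing segment */
	} else if (c->dup_acks > SKETCH_DUP_ACKS)
		c->cwnd++;		/* each further dup ack frees a segment in flight */
}
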
1125
1126
1127 /*
1128 * Process the FIN bit. This now behaves as it is supposed to:
1129 * the FIN takes effect only when it is validly part of the sequence
1130 * space, not earlier while we still have holes.
1131 *
1132 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
1133 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
1134 * TIME-WAIT)
1135 *
1136 * If we are in FINWAIT-1, a received FIN indicates simultaneous
1137 * close and we go into CLOSING (and later onto TIME-WAIT)
1138 *
1139 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
1140 *
1141 */
1142
1143 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
1144 {
1145 sk->fin_seq = skb->end_seq;
1146
1147 if (!sk->dead)
1148 {
1149 sk->state_change(sk);
1150 sock_wake_async(sk->socket, 1);
1151 }
1152
1153 switch(sk->state)
1154 {
1155 case TCP_SYN_RECV:
1156 case TCP_SYN_SENT:
1157 case TCP_ESTABLISHED:
1158 /*
1159 * move to CLOSE_WAIT, tcp_data() already handled
1160 * sending the ack.
1161 */
1162 tcp_set_state(sk,TCP_CLOSE_WAIT);
1163 if (th->rst)
1164 sk->shutdown = SHUTDOWN_MASK;
1165 break;
1166
1167 case TCP_CLOSE_WAIT:
1168 case TCP_CLOSING:
1169 /*
1170 * received a retransmission of the FIN, do
1171 * nothing.
1172 */
1173 break;
1174 case TCP_TIME_WAIT:
1175 /*
1176 * received a retransmission of the FIN,
1177 * restart the TIME_WAIT timer.
1178 */
1179 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1180 return(0);
1181 case TCP_FIN_WAIT1:
1182 /*
1183 * This case occurs when a simultaneous close
1184 * happens, we must ack the received FIN and
1185 * enter the CLOSING state.
1186 *
1187 * This causes a WRITE timeout, which will either
1188 * move on to TIME_WAIT when we timeout, or resend
1189 * the FIN properly (maybe we get rid of that annoying
1190 * FIN lost hang). The TIME_WRITE code is already correct
1191 * for handling this timeout.
1192 */
1193
1194 if(sk->ip_xmit_timeout != TIME_WRITE)
1195 tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1196 tcp_set_state(sk,TCP_CLOSING);
1197 break;
1198 case TCP_FIN_WAIT2:
1199 /*
1200 * received a FIN -- send ACK and enter TIME_WAIT
1201 */
1202 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1203 sk->shutdown|=SHUTDOWN_MASK;
1204 tcp_set_state(sk,TCP_TIME_WAIT);
1205 break;
1206 case TCP_CLOSE:
1207 /*
1208 * already in CLOSE
1209 */
1210 break;
1211 default:
1212 tcp_set_state(sk,TCP_LAST_ACK);
1213
1214 /* Start the timers. */
1215 tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
1216 return(0);
1217 }
1218
1219 return(0);
1220 }
1221
1222 /*
1223 * Add a sk_buff to the TCP receive queue, calculating
1224 * the ACK sequence as we go..
1225 */
1226 static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
1227 {
1228 struct sk_buff * prev, * next;
1229 u32 seq;
1230
1231 /*
1232 * Find where the new skb goes.. (This goes backwards,
1233 * on the assumption that we get the packets in order)
1234 */
1235 seq = skb->seq;
1236 prev = list->prev;
1237 next = (struct sk_buff *) list;
1238 for (;;) {
1239 if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
1240 break;
1241 next = prev;
1242 prev = prev->prev;
1243 }
1244 __skb_insert(skb, prev, next, list);
1245 }
1246
1247 /*
1248 * Called for each packet when we find a new ACK endpoint sequence in it
1249 */
1250 static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
1251 {
1252 /*
1253 * When we ack the fin, we do the FIN
1254 * processing.
1255 */
1256 skb->acked = 1;
1257 if (skb->h.th->fin)
1258 tcp_fin(skb,sk,skb->h.th);
1259 return skb->end_seq;
1260 }
1261
1262 static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
1263 {
1264 u32 ack_seq;
1265
1266 tcp_insert_skb(skb, &sk->receive_queue);
1267
1268 /*
1269 * Did we get anything new to ack?
1270 */
1271 ack_seq = sk->acked_seq;
1272
1273
1274 if (!after(skb->seq, ack_seq)) {
1275 if (after(skb->end_seq, ack_seq)) {
1276 /* the packet straddles our window end */
1277 struct sk_buff_head * list = &sk->receive_queue;
1278 struct sk_buff * next;
1279 ack_seq = tcp_queue_ack(skb, sk);
1280
1281 /*
1282 * Do we have any old packets to ack that the above
1283 * made visible? (Go forward from skb)
1284 */
1285 next = skb->next;
1286 while (next != (struct sk_buff *) list) {
1287 if (after(next->seq, ack_seq))
1288 break;
1289 if (after(next->end_seq, ack_seq))
1290 ack_seq = tcp_queue_ack(next, sk);
1291 next = next->next;
1292 }
1293
1294 /*
1295 * Ok, we found new data, update acked_seq as
1296 * necessary (and possibly send the actual
1297 * ACK packet).
1298 */
1299 sk->acked_seq = ack_seq;
1300
1301 } else {
1302 if (sk->debug)
1303 printk("Ack duplicate packet.\n");
1304 tcp_send_ack(sk);
1305 return;
1306 }
1307
1308
1309 /*
1310 * Delay the ack if possible. Send ack's to
1311 * fin frames immediately as there shouldn't be
1312 * anything more to come.
1313 */
1314 if (!sk->delay_acks || th->fin) {
1315 tcp_send_ack(sk);
1316 } else {
1317 /*
1318 * If psh is set we assume it's an
1319 * interactive session that wants quick
1320 * acks to avoid nagling too much.
1321 */
1322 int delay = HZ/2;
1323 if (th->psh)
1324 delay = HZ/50;
1325 tcp_send_delayed_ack(sk, delay);
1326 }
1327
1328 /*
1329 * Tell the user we have some more data.
1330 */
1331
1332 if (!sk->dead)
1333 sk->data_ready(sk,0);
1334
1335 }
1336 else
1337 {
1338 /*
1339 * If we've missed a packet, send an ack.
1340 * Also start a timer to send another.
1341 *
1342 * 4.3reno machines look for these kind of acks so
1343 * they can do fast recovery. Three identical 'old'
1344 * acks lets it know that one frame has been lost
1345 * and should be resent. Because this is before the
1346 * whole window of data has timed out it can take
1347 * one lost frame per window without stalling.
1348 * [See Jacobson RFC1323, Stevens TCP/IP illus vol2]
1349 *
1350 * We also should be spotting triple bad sequences.
1351 * [We now do this.]
1352 *
1353 */
1354
1355 if (!skb->acked)
1356 {
1357 if(sk->debug)
1358 printk("Ack past end of seq packet.\n");
1359 tcp_send_ack(sk);
1360 tcp_send_delayed_ack(sk,HZ/2);
1361 }
1362 }
1363 }
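
/*
 * Editor's sketch of the ack-advance walk done in tcp_queue() above:
 * once a segment fills the gap at acked_seq we keep swallowing queued
 * segments for as long as they stay contiguous.  Written against a
 * plain array of (seq, end_seq) pairs, sorted by seq, purely for
 * illustration.
 */
struct seg_sketch { u32 seq, end_seq; };

static u32 advance_acked(u32 acked_seq, const struct seg_sketch *q, int n)
{
	int i;
	for (i = 0; i < n; i++) {
		if (after(q[i].seq, acked_seq))
			break;				/* hole in the data: stop here */
		if (after(q[i].end_seq, acked_seq))
			acked_seq = q[i].end_seq;	/* segment extends the in-order data */
	}
	return acked_seq;
}
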
1364
1365
1366 /*
1367 * This routine handles the data. If there is room in the buffer,
1368 * it will have already been moved into it. If there is no
1369 * room, then we will just have to discard the packet.
1370 */
1371
1372 static int tcp_data(struct sk_buff *skb, struct sock *sk,
1373 unsigned long saddr, unsigned int len)
1374 {
1375 struct tcphdr *th;
1376 u32 new_seq, shut_seq;
1377
1378 th = skb->h.th;
1379 skb_pull(skb,th->doff*4);
1380 skb_trim(skb,len-(th->doff*4));
1381
1382 /*
1383 * The number of bytes in the receive read/assembly queue has increased. Needed for the
1384 * low memory discard algorithm
1385 */
1386
1387 sk->bytes_rcv += skb->len;
1388
1389 if (skb->len == 0 && !th->fin)
1390 {
1391 /*
1392 * Don't want to keep passing ack's back and forth.
1393 * (someone sent us dataless, boring frame)
1394 */
1395 if (!th->ack)
1396 tcp_send_ack(sk);
1397 kfree_skb(skb, FREE_READ);
1398 return(0);
1399 }
1400
1401 /*
1402 * We no longer have anyone receiving data on this connection.
1403 */
1404
1405 #ifndef TCP_DONT_RST_SHUTDOWN
1406
1407 if(sk->shutdown & RCV_SHUTDOWN)
1408 {
1409 /*
1410 * FIXME: BSD has some magic to avoid sending resets to
1411 * broken 4.2 BSD keepalives. Much to my surprise a few non
1412 * BSD stacks still have broken keepalives so we want to
1413 * cope with it.
1414 */
1415
1416 if(skb->len) /* We don't care if it's just an ack or
1417 a keepalive/window probe */
1418 {
1419 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
1420
1421 /* Do this the way 4.4BSD treats it. Not what I'd
1422 regard as the meaning of the spec but it's what BSD
1423 does and clearly they know everything 8) */
1424
1425 /*
1426 * This is valid because of two things
1427 *
1428 * a) The way tcp_data behaves at the bottom.
1429 * b) A fin takes effect when read not when received.
1430 */
1431
1432 shut_seq = sk->acked_seq+1; /* Last byte */
1433
1434 if(after(new_seq,shut_seq))
1435 {
1436 if(sk->debug)
1437 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
1438 sk, new_seq, shut_seq, sk->blog);
1439 if(sk->dead)
1440 {
1441 sk->acked_seq = new_seq + th->fin;
1442 tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
1443 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
1444 tcp_statistics.TcpEstabResets++;
1445 sk->err = EPIPE;
1446 sk->error_report(sk);
1447 sk->shutdown = SHUTDOWN_MASK;
1448 tcp_set_state(sk,TCP_CLOSE);
1449 kfree_skb(skb, FREE_READ);
1450 return 0;
1451 }
1452 }
1453 }
1454 }
1455
1456 #endif
1457
1458 tcp_queue(skb, sk, th);
1459
1460 return(0);
1461 }
1462
1463
1464 /*
1465 * This routine is only called when we have urgent data
1466 * signalled. It's the 'slow' part of tcp_urg. It could be
1467 * moved inline now as tcp_urg is only called from one
1468 * place. We handle URGent data wrong. We have to - as
1469 * BSD still doesn't use the correction from RFC961.
1470 */
1471
1472 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
1473 {
1474 u32 ptr = ntohs(th->urg_ptr);
1475
1476 if (ptr)
1477 ptr--;
1478 ptr += ntohl(th->seq);
1479
1480 /* ignore urgent data that we've already seen and read */
1481 if (after(sk->copied_seq, ptr))
1482 return;
1483
1484 /* do we already have a newer (or duplicate) urgent pointer? */
1485 if (sk->urg_data && !after(ptr, sk->urg_seq))
1486 return;
1487
1488 /* tell the world about our new urgent pointer */
1489 if (sk->proc != 0) {
1490 if (sk->proc > 0) {
1491 kill_proc(sk->proc, SIGURG, 1);
1492 } else {
1493 kill_pg(-sk->proc, SIGURG, 1);
1494 }
1495 }
1496 sk->urg_data = URG_NOTYET;
1497 sk->urg_seq = ptr;
1498 }
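
/*
 * Editor's sketch: the arithmetic above turns the 16-bit urgent offset
 * in the header into an absolute sequence number for the urgent byte
 * (using the BSD interpretation that the pointer names the byte after
 * the urgent one, hence the decrement); tcp_urg() below then turns
 * that back into an offset within a later segment.  Illustrative
 * names only.
 */
static u32 urg_byte_seq(u32 seg_seq, unsigned short urg_ptr_host)
{
	if (urg_ptr_host)
		urg_ptr_host--;			/* BSD: pointer is one past the urgent byte */
	return seg_seq + urg_ptr_host;		/* sequence number of the urgent byte */
}

static long urg_offset_in_segment(u32 urg_seq, u32 seg_seq, int header_len)
{
	/* offset from the start of the TCP header; a result at or beyond
	   the total segment length means the byte is in a later segment */
	return (long) (urg_seq - seg_seq) + header_len;
}
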
1499
1500 /*
1501 * This is the 'fast' part of urgent handling.
1502 */
1503
1504 static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
1505 {
1506 /*
1507 * Check if we get a new urgent pointer - normally not
1508 */
1509
1510 if (th->urg)
1511 tcp_check_urg(sk,th);
1512
1513 /*
1514 * Do we wait for any urgent data? - normally not
1515 */
1516
1517 if (sk->urg_data == URG_NOTYET) {
1518 u32 ptr;
1519
1520 /*
1521 * Is the urgent pointer pointing into this packet?
1522 */
1523 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
1524 if (ptr < len) {
1525 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
1526 if (!sk->dead)
1527 sk->data_ready(sk,0);
1528 }
1529 }
1530 }
1531
1532 /*
1533 * This should be a bit smarter and remove partially
1534 * overlapping stuff too, but this should be good
1535 * enough for any even remotely normal case (and the
1536 * worst that can happen is that we have a few
1537 * unnecessary packets in the receive queue).
1538 *
1539 * This function is never called with an empty list..
1540 */
1541 static inline void tcp_remove_dups(struct sk_buff_head * list)
1542 {
1543 struct sk_buff * next = list->next;
1544
1545 for (;;) {
1546 struct sk_buff * skb = next;
1547 next = next->next;
1548 if (next == (struct sk_buff *) list)
1549 break;
1550 if (before(next->end_seq, skb->end_seq)) {
1551 __skb_unlink(next, list);
1552 kfree_skb(next, FREE_READ);
1553 next = skb;
1554 continue;
1555 }
1556 if (next->seq != skb->seq)
1557 continue;
1558 __skb_unlink(skb, list);
1559 kfree_skb(skb, FREE_READ);
1560 }
1561 }
1562
1563 /*
1564 * Throw out all unnecessary packets: we've gone over the
1565 * receive queue limit. This shouldn't happen in a normal
1566 * TCP connection, but we might have gotten duplicates etc.
1567 */
1568 static void prune_queue(struct sk_buff_head * list)
1569 {
1570 for (;;) {
1571 struct sk_buff * skb = list->prev;
1572
1573 /* gone through it all? */
1574 if (skb == (struct sk_buff *) list)
1575 break;
1576 if (!skb->acked) {
1577 __skb_unlink(skb, list);
1578 kfree_skb(skb, FREE_READ);
1579 continue;
1580 }
1581 tcp_remove_dups(list);
1582 break;
1583 }
1584 }
1585
1586 /*
1587 * A TCP packet has arrived.
1588 * skb->h.raw is the TCP header.
1589 */
1590
1591 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
1592 __u32 daddr, unsigned short len,
1593 __u32 saddr, int redo, struct inet_protocol * protocol)
1594 {
1595 struct tcphdr *th;
1596 struct sock *sk;
1597 int syn_ok=0;
1598
1599 /*
1600 * "redo" is 1 if we have already seen this skb but couldn't
1601 * use it at that time (the socket was locked). In that case
1602 * we have already done a lot of the work (looked up the socket
1603 * etc).
1604 */
1605 th = skb->h.th;
1606 sk = skb->sk;
1607 if (!redo) {
1608 tcp_statistics.TcpInSegs++;
1609 if (skb->pkt_type!=PACKET_HOST)
1610 goto discard_it;
1611
1612 /*
1613 * Pull up the IP header.
1614 */
1615
1616 skb_pull(skb, skb->h.raw-skb->data);
1617
1618 /*
1619 * Try to use the device checksum if provided.
1620 */
1621 switch (skb->ip_summed)
1622 {
1623 case CHECKSUM_NONE:
1624 skb->csum = csum_partial((char *)th, len, 0);
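/* deliberate fall through: the checksum just computed in software is
   verified by the CHECKSUM_HW case below */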
1625 case CHECKSUM_HW:
1626 if (tcp_check(th, len, saddr, daddr, skb->csum))
1627 goto discard_it;
1628 default:
1629 /* CHECKSUM_UNNECESSARY */
1630 }
1631 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
1632 if (!sk)
1633 goto no_tcp_socket;
1634 skb->sk = sk;
1635 skb->seq = ntohl(th->seq);
1636 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
1637 skb->ack_seq = ntohl(th->ack_seq);
1638
1639 skb->acked = 0;
1640 skb->used = 0;
1641 skb->free = 1;
1642 skb->saddr = daddr;
1643 skb->daddr = saddr;
1644
1645 /*
1646 * We may need to add it to the backlog here.
1647 */
1648 if (sk->users)
1649 {
1650 __skb_queue_tail(&sk->back_log, skb);
1651 return(0);
1652 }
1653 }
1654
1655 /*
1656 * If this socket has got a reset it's to all intents and purposes
1657 * really dead. Count closed sockets as dead.
1658 *
1659 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
1660 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
1661 * exist so should cause resets as if the port was unreachable.
1662 */
1663
1664 if (sk->zapped || sk->state==TCP_CLOSE)
1665 goto no_tcp_socket;
1666
1667 if (!sk->prot)
1668 {
1669 printk("IMPOSSIBLE 3\n");
1670 return(0);
1671 }
1672
1673
1674 /*
1675 * Charge the memory to the socket.
1676 */
1677
1678 skb->sk=sk;
1679 atomic_add(skb->truesize, &sk->rmem_alloc);
1680
1681 /*
1682 * We should now do header prediction.
1683 */
1684
1685 /*
1686 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
1687 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
1688 * compatibility. We also set up variables more thoroughly [Karn notes in the
1689 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
1690 */
1691
1692 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
1693 {
1694
1695 /*
1696 * Now deal with unusual cases.
1697 */
1698
1699 if(sk->state==TCP_LISTEN)
1700 {
1701 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
1702 tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
1703
1704 /*
1705 * We don't care about RST, and non-SYN segments are absorbed (old segments).
1706 * Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
1707 * netmask on a running connection it can go broadcast. Even Suns have
1708 * this problem so I'm ignoring it
1709 */
1710
1711 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
1712 {
1713 kfree_skb(skb, FREE_READ);
1714 return 0;
1715 }
1716
1717 /*
1718 * Guess we need to make a new socket up
1719 */
1720
1721 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
1722
1723 /*
1724 * Now we have several options: In theory there is nothing else
1725 * in the frame. KA9Q has an option to send data with the syn,
1726 * BSD accepts data with the syn up to the [to be] advertised window
1727 * and Solaris 2.1 gives you a protocol error. For now we just ignore
1728 * it, that fits the spec precisely and avoids incompatibilities. It
1729 * would be nice in future to drop through and process the data.
1730 *
1731 * Now that T/TCP is starting to see use, we ought to queue this data.
1732 */
1733
1734 return 0;
1735 }
1736
1737 /*
1738 * Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
1739 * then it's a new connection
1740 */
1741
1742 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
1743 {
1744 kfree_skb(skb, FREE_READ);
1745 return 0;
1746 }
1747
1748 /*
1749 * SYN sent means we have to look for a suitable ack and either reset
1750 * for bad matches or go to connected. The SYN_SENT case is unusual and should
1751 * not be in line code. [AC]
1752 */
1753
1754 if(sk->state==TCP_SYN_SENT)
1755 {
1756 /* Crossed SYN or previous junk segment */
1757 if(th->ack)
1758 {
1759 /* We got an ack, but it's not a good ack */
1760 if(!tcp_ack(sk,th,skb->ack_seq,len))
1761 {
1762 /* Reset the ack - it's an ack from a
1763 different connection [ th->rst is checked in tcp_send_reset()] */
1764 tcp_statistics.TcpAttemptFails++;
1765 tcp_send_reset(daddr, saddr, th,
1766 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1767 kfree_skb(skb, FREE_READ);
1768 return(0);
1769 }
1770 if(th->rst)
1771 return tcp_reset(sk,skb);
1772 if(!th->syn)
1773 {
1774 /* A valid ack from a different connection
1775 start. Shouldn't happen but cover it */
1776 tcp_statistics.TcpAttemptFails++;
1777 tcp_send_reset(daddr, saddr, th,
1778 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
1779 kfree_skb(skb, FREE_READ);
1780 return 0;
1781 }
1782 /*
1783 * Ok.. it's good. Set up sequence numbers and
1784 * move to established.
1785 */
1786 syn_ok=1; /* Don't reset this connection for the syn */
1787 sk->acked_seq = skb->seq+1;
1788 sk->lastwin_seq = skb->seq+1;
1789 sk->fin_seq = skb->seq;
1790 tcp_send_ack(sk);
1791 tcp_set_state(sk, TCP_ESTABLISHED);
1792 tcp_options(sk,th);
1793 sk->dummy_th.dest=th->source;
1794 sk->copied_seq = sk->acked_seq;
1795 if(!sk->dead)
1796 {
1797 sk->state_change(sk);
1798 sock_wake_async(sk->socket, 0);
1799 }
1800 if(sk->max_window==0)
1801 {
1802 sk->max_window = 32;
1803 sk->mss = min(sk->max_window, sk->mtu);
1804 }
1805 }
1806 else
1807 {
1808 /* See if SYN's cross. Drop if boring */
1809 if(th->syn && !th->rst)
1810 {
1811 /* Crossed SYN's are fine - but talking to
1812 yourself is right out... */
1813 if(sk->saddr==saddr && sk->daddr==daddr &&
1814 sk->dummy_th.source==th->source &&
1815 sk->dummy_th.dest==th->dest)
1816 {
1817 tcp_statistics.TcpAttemptFails++;
1818 return tcp_reset(sk,skb);
1819 }
1820 tcp_set_state(sk,TCP_SYN_RECV);
1821
1822 /*
1823 * FIXME:
1824 * Must send SYN|ACK here
1825 */
1826 }
1827 /* Discard junk segment */
1828 kfree_skb(skb, FREE_READ);
1829 return 0;
1830 }
1831 /*
1832 * SYN_RECV with data maybe.. drop through
1833 */
1834 goto rfc_step6;
1835 }
1836
1837 /*
1838 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
1839 * a more complex suggestion for fixing these reuse issues in RFC1644
1840 * but not yet ready for general use. Also see RFC1379.
1841 *
1842 * Note the funny way we go back to the top of this function for
1843 * this case ("goto try_next_socket"). That also takes care of
1844 * checking "sk->users" for the new socket as well as doing all
1845 * the normal tests on the packet.
1846 */
1847
1848 #define BSD_TIME_WAIT
1849 #ifdef BSD_TIME_WAIT
1850 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
1851 after(skb->seq, sk->acked_seq) && !th->rst)
1852 {
1853 u32 seq = sk->write_seq;
1854 if(sk->debug)
1855 printk("Doing a BSD time wait\n");
1856 tcp_statistics.TcpEstabResets++;
1857 atomic_sub(skb->truesize, &sk->rmem_alloc);
1858 skb->sk = NULL;
1859 sk->err=ECONNRESET;
1860 tcp_set_state(sk, TCP_CLOSE);
1861 sk->shutdown = SHUTDOWN_MASK;
1862 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
1863 /* this is not really correct: we should check sk->users */
1864 if (sk && sk->state==TCP_LISTEN)
1865 {
1866 skb->sk = sk;
1867 atomic_add(skb->truesize, &sk->rmem_alloc);
1868 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
1869 return 0;
1870 }
1871 kfree_skb(skb, FREE_READ);
1872 return 0;
1873 }
1874 #endif
1875 }
1876
1877 /*
1878 * We are now in normal data flow (see the step list in the RFC)
1879 * Note most of these are inline now. I'll inline the lot when
1880 * I have time to test it hard and look at what gcc outputs
1881 */
1882
1883 if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
1884 {
1885 bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
1886 kfree_skb(skb, FREE_READ);
1887 return 0;
1888 }
1889
1890 if(th->rst)
1891 return tcp_reset(sk,skb);
1892
1893 /*
1894 * !syn_ok is effectively the state test in RFC793.
1895 */
1896
1897 if(th->syn && !syn_ok)
1898 {
1899 tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
1900 return tcp_reset(sk,skb);
1901 }
1902
1903 tcp_delack_estimator(sk);
1904
1905 /*
1906 * Process the ACK
1907 */
1908
1909
1910 if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
1911 {
1912 /*
1913 * Our three way handshake failed.
1914 */
1915
1916 if(sk->state==TCP_SYN_RECV)
1917 {
1918 tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
1919 }
1920 kfree_skb(skb, FREE_READ);
1921 return 0;
1922 }
1923
1924 rfc_step6: /* I'll clean this up later */
1925
1926 /*
1927 * If the accepted buffer put us over our queue size we
1928 * now drop it (we must process the ack first to avoid
1929 * deadlock cases).
1930 */
1931
1932 /*
1933 * Process urgent data
1934 */
1935
1936 tcp_urg(sk, th, len);
1937
1938 /*
1939 * Process the encapsulated data
1940 */
1941
1942 if(tcp_data(skb,sk, saddr, len))
1943 kfree_skb(skb, FREE_READ);
1944
1945 /*
1946 * If our receive queue has grown past its limits,
1947 * try to prune away duplicates etc..
1948 */
1949 if (sk->rmem_alloc > sk->rcvbuf)
1950 prune_queue(&sk->receive_queue);
1951
1952 /*
1953 * And done
1954 */
1955
1956 return 0;
1957
1958 no_tcp_socket:
1959 /*
1960 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
1961 */
1962 tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
1963
1964 discard_it:
1965 /*
1966 * Discard frame
1967 */
1968 skb->sk = NULL;
1969 kfree_skb(skb, FREE_READ);
1970 return 0;
1971 }