/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while sk->inuse=1
 *					and was trying to connect (tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It wakes people
 *					on errors. select behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_reset() fixed to work for everything not just
 *					packets for unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames. Acking
 *					any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst receive
 *					otherwise odd bits of prattle escape still
 *		Alan Cox	:	Fixed another acking RST frame bug. Should stop
 *					LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential nasty.
 *		Alan Cox	:	Added some better commenting, as the tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly:	ack < copied bug fix.
 *		Johannes Stille	:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries. Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks, so the kernel can layer network sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing (not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the TCP code
 *					(Be very nice to this man if tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle select() after URG properly in all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in selecting before an accept.
 *		Alan Cox	:	Kept the state trace facility since it's
 *					handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on the RFC's
 *					for other useful protocol references see:
 *					Comer, KA9Q NOS, and for a reference on the
 *					difference between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC and using multiple
 *					timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	Select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Mark Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but its a start!
 *
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Implement RFC 1191 [Path MTU discovery]
 *		Look at the effect of implementing RFC 1337 suggestions and their impact.
 *		Rewrite output state machine to use a single queue and do low window
 *		situations as per the spec (RFC 1122)
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *		Cope with MTU/device switches when retransmitting in tcp.
 *		Fix the window handling to use PR's new code.
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or(at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING. Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown. There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/termios.h>
#include <linux/in.h>
#include <linux/fcntl.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <linux/mm.h>
#include <net/checksum.h>

/*
 * The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z) reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib tcp_statistics;

/*
 * Cached last hit socket
 */

volatile unsigned long th_cache_saddr,th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	unsigned long flags;
	save_flags(flags);
	cli();
	th_cache_saddr=0;
	th_cache_daddr=0;
	th_cache_dport=0;
	th_cache_sport=0;
	th_cache_sk=NULL;
	restore_flags(flags);
}

static void tcp_close(struct sock *sk, int timeout);


/*
 * The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;

static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
}

/*
 * This routine picks a TCP window for a socket based on
 * the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * For now we use NET2E3's heuristic of offering half the memory
 * we have handy. All is not as bad as this seems however because
 * of two things. Firstly we will bin packets even within the window
 * in order to get the data we are waiting for into the memory limit.
 * Secondly we bin common duplicate forms at receive time
 * Better heuristics welcome
 */

int tcp_select_window(struct sock *sk)
{
	int new_window = sk->prot->rspace(sk);

	if(sk->window_clamp)
		new_window=min(sk->window_clamp,new_window);
	/*
	 * Two things are going on here. First, we don't ever offer a
	 * window less than min(sk->mss, MAX_WINDOW/2). This is the
	 * receiver side of SWS as specified in RFC1122.
	 * Second, we always give them at least the window they
	 * had before, in order to avoid retracting window. This
	 * is technically allowed, but RFC1122 advises against it and
	 * in practice it causes trouble.
	 *
	 * Fixme: This doesn't correctly handle the case where
	 * new_window > sk->window but not by enough to allow for the
	 * shift in sequence space.
	 */
	if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
		return(sk->window);
	return(new_window);
}
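
/*
 * Illustrative arithmetic for the rule above (the numbers are assumed,
 * not taken from this file): with sk->mss = 1460 and MAX_WINDOW/2
 * comfortably larger than that, an rspace() result of, say, 700 bytes
 * fails the min(sk->mss, MAX_WINDOW/2) test, so we keep advertising the
 * previous window rather than offer a silly small one.
 */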

/*
 * Find someone to 'accept'. Must be called with
 * sk->inuse=1 or cli()
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 * Remove a completed connection and return it. This is used by
 * tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}
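
/*
 * Illustrative only: a minimal sketch of how an accept path might pull a
 * completed connection off with tcp_dequeue_established(). The real
 * consumer is tcp_accept() (later in this file); 'newsk' is a
 * hypothetical name and the fragment is kept inside #if 0 so it is
 * never compiled.
 */
#if 0
	struct sk_buff *skb = tcp_dequeue_established(sk);
	struct sock *newsk = NULL;

	if (skb != NULL)
		newsk = skb->sk;	/* the newly completed child socket */
#endif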

/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted. Currently it is only called by
 * tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead=1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 * Enter the time wait state.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}

/*
 * A socket has timed out on its send queue and wants to do a
 * little retransmitting. Currently this means TCP.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;

		/*
		 * Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet. However we
		 * need to use the current ack and window fields. Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data. That shouldn't be a problem,
		 * if the other end is doing the right thing. Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);

		/*
		 * Note: We ought to check for window limits here but
		 * currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 * Put a MAC header back on (may cause ARPing)
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 * This is not the right way to handle this. We have to
			 * issue an up to date window and ack report with this
			 * retransmit to keep the odd buggy tcp that relies on
			 * the fact BSD does this happy.
			 * We don't however need to recalculate the entire
			 * checksum, so someone wanting a small problem to play
			 * with might like to implement RFC1141/RFC1624 and speed
			 * this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 * If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 * If the packet is still being sent by the device/protocol
				 * below then don't retransmit. This is both needed, and good -
				 * especially with connected mode AX.25 where it stops resends
				 * occurring of an as yet unsent anyway frame!
				 * We still add up the counts as the round trip time wants
				 * adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 * Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 * Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 * This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}
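
/*
 * A minimal sketch (not called anywhere in this file) of the RFC 1624
 * incremental checksum update hinted at in the comment above: when a
 * single 16-bit word of a checksummed header changes from m to m', the
 * new checksum is ~(~HC + ~m + m') in one's complement arithmetic.
 * The function name is illustrative only.
 */
static unsigned short csum_update_sketch(unsigned short check,
	unsigned short old_word, unsigned short new_word)
{
	unsigned long sum;

	sum = (~check & 0xffff) + (~old_word & 0xffff) + new_word;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry back in */
	sum += sum >> 16;			/* and any carry from the fold */
	return (unsigned short)(~sum & 0xffff);
}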

/*
 * Reset the retransmission timer
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if((int)when < 0)
	{
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=jiffies+when;
	add_timer(&sk->retransmit_timer);
}

/*
 * This is the normal code called for timeouts. It does the retransmission
 * and then does backoff. tcp_do_retransmit is separated out because
 * tcp_ack needs to send stuff from the retransmit queue without
 * initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit. Note that
	 * we do not increase the rtt estimate. rto is initialized
	 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic. netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT. I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
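
	/*
	 * Worked example (assuming HZ=100 and an rto that has reached
	 * 3*HZ): successive timeouts give 6*HZ, 12*HZ, 24*HZ, 48*HZ,
	 * 96*HZ, and the min() below then clamps everything at 120*HZ.
	 */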

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}


/*
 * A timer event has triggered a tcp retransmit timeout. The
 * socket xmit queue is ready and set up to send. Because
 * the ack receive code keeps the queue straight we do
 * nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1;	/* remember window where we lost */
	/* sk->ssthresh in theory can be zero. I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}

/*
 * A write timeout has occurred. Process the after effects.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 * Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 * Attempt to recover if arp has changed (unlikely!) or
		 * a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 * Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 * Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 * Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 * Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}

/*
 * The TCP retransmit timer. This lacks a few small details.
 *
 * 1. An initial rtt timeout on the probe0 should cause what we can
 *    of the first write queue buffer to be split and sent.
 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
 *    ETIMEDOUT if we know an additional 'soft' error caused this.
 *    tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 * only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
			{
				struct sk_buff *skb;
				unsigned long flags;

				save_flags(flags);
				cli();
				skb = sk->send_head;
				if (!skb)
				{
					restore_flags(flags);
				}
				else
				{
					/*
					 * Kicked by a delayed ack. Reset timer
					 * correctly now
					 */
					if (jiffies < skb->when + sk->rto)
					{
						reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
						restore_flags(flags);
						break;
					}
					restore_flags(flags);
					/*
					 * Retransmission
					 */
					sk->prot->retransmit (sk, 0);
					tcp_write_timeout(sk);
				}
				break;
			}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 */

void tcp_err(int err, unsigned char *header, unsigned long daddr,
	unsigned long saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if(err<0)
	{
		sk->err = -err;
		sk->error_report(sk);
		return;
	}

	if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	/* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
	{
		if (sk->state == TCP_SYN_SENT)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
		sk->err = icmp_err_convert[err & 0xff].errno;
	}
	return;
}


/*
 * Walk down the receive queue counting readable data until we hit the end or we find a gap
 * in the received data queue (ie a frame missing that needs sending to us). Not
 * sorting using two queues as data arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 * Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stops here */
			break;
		sum = skb->len -(counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue. Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read(). And the queue scan in tcp_read()
		 * was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}

/*
 * LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);
		return retval;
	}
	return 0;
}


/*
 * Wait for a TCP event.
 *
 * Note that we don't need to set "sk->inuse", as the upper select layers
 * take care of normal races (between the test and the event) and we don't
 * go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sk->prot->wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 * This routine computes a TCP checksum.
 *
 * Modified January 1995 from a go-faster DOS routine by
 * Jorge Cwik <jorge@laser.satlink.net>
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
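
/*
 * For reference, the csum_tcpudp_magic() call above folds in the RFC 793
 * pseudo header ahead of the sum over the TCP header and data. The
 * struct below is an illustrative sketch of that pseudo header and is
 * not used anywhere in this file.
 */
struct tcp_pseudo_header_sketch {
	unsigned long	saddr;		/* IP source address */
	unsigned long	daddr;		/* IP destination address */
	unsigned char	zero;		/* always zero */
	unsigned char	protocol;	/* IPPROTO_TCP */
	unsigned short	length;		/* TCP header plus data, network order */
};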



void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	return;
}

/*
 * This is the main buffer sending routine. We queue the buffer
 * having checked it is sane seeming.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 * length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 * Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 * If we have queued a header size packet.. (these crash a few
	 * tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 * Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 * We must queue if
	 *
	 * a) The right edge of this frame exceeds the window
	 * b) We are retransmitting (Nagle's rule)
	 * c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	    sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 * If we don't fit we have to start the zero window
		 * probes. This is broken - we really need to do a partial
		 * send _first_ (This is what causes the Cisco and PC/TCP
		 * grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 * This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 * This is mad. The tcp retransmit queue is put together
		 * by the ip layer. This causes half the problems with
		 * unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 * Set for next retransmit based on expected ACK time.
		 * FIXME: We set this every time which means our
		 * retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}

/*
 * Locking problems lead us to a messy situation where we can have
 * multiple partially complete buffers queued up. This is really bad
 * as we don't want to be sending partial buffers. Fix this with
 * a semaphore or similar to lock tcp_write per socket.
 *
 * These routines are pretty self descriptive.
 */

struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 * Empty the partial queue
 */

static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 * Queue a partial frame
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 * Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}


/*
 * This routine sends an ack and also updates the window.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
	struct sock *sk,
	struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 * Force it to send an ack. We don't have to do this
		 * (ACK is unreliable) but it's much better use of
		 * bandwidth on slow links to send a spare ack than
		 * resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 * Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sk->prot->wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 * Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 * If we have nothing queued for transmit and the transmit timer
	 * is on we are just doing an ACK timeout and need to switch
	 * to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 * Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}


/*
 * This routine builds a generic TCP header.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 * This routine copies from a user buffer into a socket,
 * and starts the transmit system.
 */

static int tcp_write(struct sock *sk, const unsigned char *from,
	int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{	/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 * First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 * Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();

			if (sk->state != TCP_ESTABLISHED &&
			    sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

		/*
		 * The following code can result in copy <= 0 if sk->mss is ever
		 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
		 * sk->mtu is constant once SYN processing is finished. I.e. we
		 * had better not get here until we've seen his SYN and at least one
		 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
		 * But ESTABLISHED should guarantee that. sk->max_window is by definition
		 * non-decreasing. Note that any ioctl to set user_mss must be done
		 * before the exchange of SYN's. If the initial ack from the other
		 * end has a window of 0, max_window and thus mss will both be 0.
		 */

		/*
		 * Now we need to check if we have a half built packet.
		 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			/* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				+ sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb_put(skb,copy), from, copy);
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			if ((skb->len - hdrlen) >= sk->mss ||
			    (flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

		/*
		 * We also need to worry about the window.
		 * If window < 1/2 the maximum window we've seen from this
		 * host, don't use it. This is sender side
		 * silly window prevention, as specified in RFC1122.
		 * (Note that this is different than earlier versions of
		 * SWS prevention, e.g. RFC813.). What we actually do is
		 * use the whole MSS. Since this results in the right
		 * edge of the packet being outside the window, it will
		 * be queued for later rather than sent.
		 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

		/*
		 * We should really check the window here also.
		 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 * We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 * NB: following must be mtu, because mss can be increased.
			 * mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		}
		else
		{
			/*
			 * We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 * If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 * FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 * Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
			    (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
			    && sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->dev = dev;
		skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
		tmp = tcp_build_header(skb->h.th, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			skb->h.th->urg = 1;
			skb->h.th->urg_ptr = ntohs(copy);
		}

		memcpy_fromfs(skb_put(skb,copy), from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->free = 0;
		sk->write_seq += copy;

		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

	/*
	 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 * interactive fast network servers. It's meant to be on and
	 * it really improves the throughput though not the echo time
	 * on my slow slip link - Alan
	 */

	/*
	 * Avoid possible race on send_tmp - c/o Johannes Stille
	 */

	if(sk->partial && ((!sk->packets_out)
	/* If not nagling we can send on the before case too.. */
	   || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}

/*
 * This is just a wrapper.
 */

static int tcp_sendto(struct sock *sk, const unsigned char *from,
	int len, int nonblock, unsigned flags,
	struct sockaddr_in *addr, int addr_len)
{
	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		return -EINVAL;
	if (sk->state == TCP_CLOSE)
		return -ENOTCONN;
	if (addr_len < sizeof(*addr))
		return -EINVAL;
	if (addr->sin_family && addr->sin_family != AF_INET)
		return -EINVAL;
	if (addr->sin_port != sk->dummy_th.dest)
		return -EISCONN;
	if (addr->sin_addr.s_addr != sk->daddr)
		return -EISCONN;
	return tcp_write(sk, from, len, nonblock, flags);
}


/*
 * Send an ack if one is backlogged at this point. Ought to merge
 * this with tcp_send_ack().
 */

static void tcp_read_wakeup(struct sock *sk)
1806 {
1807 int tmp;
1808 struct device *dev = NULL;
1809 struct tcphdr *t1;
1810 struct sk_buff *buff;
1811
1812 if (!sk->ack_backlog)
1813 return;
1814
1815 /*
1816 * If we're closed, don't send an ack, or we'll get a RST
1817 * from the closed destination.
1818 */
1819 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
1820 return;
1821
1822 /*
1823 * FIXME: we need to put code here to prevent this routine from
1824 * being called. Being called once in a while is ok, so only check
1825 * if this is the second time in a row.
1826 */
1827
1828 /*
1829 * We need to grab some memory, and put together an ack,
1830 * and then put it into the queue to be sent.
1831 */
1832
1833 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1834 if (buff == NULL)
1835 {
1836 /* Try again real soon. */
1837 reset_xmit_timer(sk, TIME_WRITE, HZ);
1838 return;
1839 }
1840
1841 buff->sk = sk;
1842 buff->localroute = sk->localroute;
1843
1844 /*
1845 * Put in the IP header and routing stuff.
1846 */
1847
1848 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1849 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1850 if (tmp < 0)
1851 {
1852 buff->free = 1;
1853 sk->prot->wfree(sk, buff);
1854 return;
1855 }
1856
1857 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1858
1859 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1860 t1->seq = htonl(sk->sent_seq);
1861 t1->ack = 1;
1862 t1->res1 = 0;
1863 t1->res2 = 0;
1864 t1->rst = 0;
1865 t1->urg = 0;
1866 t1->syn = 0;
1867 t1->psh = 0;
1868 sk->ack_backlog = 0;
1869 sk->bytes_rcv = 0;
1870 sk->window = tcp_select_window(sk);
1871 t1->window = ntohs(sk->window);
1872 t1->ack_seq = ntohl(sk->acked_seq);
1873 t1->doff = sizeof(*t1)/4;
1874 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1875 sk->prot->queue_xmit(sk, dev, buff, 1);
1876 tcp_statistics.TcpOutSegs++;
1877 }
1878
1879
1880 /*
1881 * FIXME:
1882 * This routine frees used buffers.
1883 * It should consider sending an ACK to let the
1884 * other end know we now have a bigger window.
1885 */
1886
1887 static void cleanup_rbuf(struct sock *sk)
1888 {
1889 unsigned long flags;
1890 unsigned long left;
1891 struct sk_buff *skb;
1892 unsigned long rspace;
1893
1894 if(sk->debug)
1895 printk("cleaning rbuf for sk=%p\n", sk);
1896
1897 save_flags(flags);
1898 cli();
1899
1900 left = sk->prot->rspace(sk);
1901
1902 /*
1903 * We have to loop through all the buffer headers,
1904 * and try to free up all the space we can.
1905 */
1906
1907 while((skb=skb_peek(&sk->receive_queue)) != NULL)
1908 {
1909 if (!skb->used || skb->users)
1910 break;
1911 skb_unlink(skb);
1912 skb->sk = sk;
1913 kfree_skb(skb, FREE_READ);
1914 }
1915
1916 restore_flags(flags);
1917
1918 /*
1919 * FIXME:
1920 * At this point we should send an ack if the difference
1921 * between the advertised window and the amount of free
1922 * space is bigger than TCP_WINDOW_DIFF.
1923 */
1924
1925 if(sk->debug)
1926 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1927 left);
1928 if ((rspace=sk->prot->rspace(sk)) != left)
1929 {
1930 /*
1931 * This area has caused the most trouble. The current strategy
1932 * is to simply do nothing if the other end has room to send at
1933 * least 3 full packets, because the ack from those will auto-
1934 * matically update the window. If the other end doesn't think
1935 * we have much space left, but we have room for at least 1 more
1936 * complete packet than it thinks we do, we will send an ack
1937 * immediately. Otherwise we will wait up to .5 seconds in case
1938 * the user reads some more.
1939 */
1940 sk->ack_backlog++;
1941 /*
1942 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
1943 * if the other end is offering a window smaller than the agreed on MSS
1944 * (called sk->mtu here). In theory there's no connection between send
1945 * and receive, and so no reason to think that they're going to send
1946 * small packets. For the moment I'm using the hack of reducing the mss
1947 * only on the send side, so I'm putting mtu here.
1948 */
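/*
 * Worked example (illustrative figures, not from the original source):
 * with an advertised window of 8192 bytes, 2048 bytes received but not
 * yet acked, and an mtu of 1460, the test below becomes
 *
 *	rspace > 8192 - 2048 + 1460 = 7604
 *
 * i.e. we ack at once only when we can advertise at least one full
 * packet more space than the other end believes it has.
 */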
1949
1950 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
1951 {
1952 /* Send an ack right now. */
1953 tcp_read_wakeup(sk);
1954 }
1955 else
1956 {
1957 /* Force it to send an ack soon. */
1958 int was_active = del_timer(&sk->retransmit_timer);
1959 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
1960 {
1961 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1962 }
1963 else
1964 add_timer(&sk->retransmit_timer);
1965 }
1966 }
1967 }
1968
1969
1970 /*
1971 * Handle reading urgent data. BSD has very simple semantics for
1972 * this, no blocking and very strange errors 8)
1973 */
1974
1975 static int tcp_read_urg(struct sock * sk, int nonblock,
1976 unsigned char *to, int len, unsigned flags)
1977 {
1978 /*
1979 * No URG data to read
1980 */
1981 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1982 return -EINVAL; /* Yes this is right ! */
1983
1984 if (sk->err)
1985 {
1986 int tmp = -sk->err;
1987 sk->err = 0;
1988 return tmp;
1989 }
1990
1991 if (sk->state == TCP_CLOSE || sk->done)
1992 {
1993 if (!sk->done) {
1994 sk->done = 1;
1995 return 0;
1996 }
1997 return -ENOTCONN;
1998 }
1999
2000 if (sk->shutdown & RCV_SHUTDOWN)
2001 {
2002 sk->done = 1;
2003 return 0;
2004 }
2005 sk->inuse = 1;
2006 if (sk->urg_data & URG_VALID)
2007 {
2008 char c = sk->urg_data;
2009 if (!(flags & MSG_PEEK))
2010 sk->urg_data = URG_READ;
2011 put_fs_byte(c, to);
2012 release_sock(sk);
2013 return 1;
2014 }
2015 release_sock(sk);
2016
2017 /*
2018 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2019 * the available implementations agree in this case:
2020 * this call should never block, independent of the
2021 * blocking state of the socket.
2022 * Mike <pall@rz.uni-karlsruhe.de>
2023 */
2024 return -EAGAIN;
2025 }
2026
2027
2028 /*
2029 * This routine copies from a sock struct into the user buffer.
2030 */
2031
2032 static int tcp_read(struct sock *sk, unsigned char *to,
2033 int len, int nonblock, unsigned flags)
2034 {
2035 struct wait_queue wait = { current, NULL };
2036 int copied = 0;
2037 u32 peek_seq;
2038 volatile u32 *seq; /* So gcc doesn't overoptimise */
2039 unsigned long used;
2040
2041 /*
2042 * This error should be checked.
2043 */
2044
2045 if (sk->state == TCP_LISTEN)
2046 return -ENOTCONN;
2047
2048 /*
2049 * Urgent data needs to be handled specially.
2050 */
2051
2052 if (flags & MSG_OOB)
2053 return tcp_read_urg(sk, nonblock, to, len, flags);
2054
2055 /*
2056 * Copying sequence to update. This is volatile to handle
2057 * the multi-reader case neatly (memcpy_to/fromfs might be
2058 * inline and thus not flush cached variables otherwise).
2059 */
2060
2061 peek_seq = sk->copied_seq;
2062 seq = &sk->copied_seq;
2063 if (flags & MSG_PEEK)
2064 seq = &peek_seq;
2065
2066 add_wait_queue(sk->sleep, &wait);
2067 sk->inuse = 1;
2068 while (len > 0)
2069 {
2070 struct sk_buff * skb;
2071 u32 offset;
2072
2073 /*
2074 * Are we at urgent data? Stop if we have read anything.
2075 */
2076
2077 if (copied && sk->urg_data && sk->urg_seq == *seq)
2078 break;
2079
2080 /*
2081 * Next get a buffer.
2082 */
2083
2084 current->state = TASK_INTERRUPTIBLE;
2085
2086 skb = skb_peek(&sk->receive_queue);
2087 do
2088 {
2089 if (!skb)
2090 break;
2091 if (before(*seq, skb->h.th->seq))
2092 break;
2093 offset = *seq - skb->h.th->seq;
2094 if (skb->h.th->syn)
2095 offset--;
2096 if (offset < skb->len)
2097 goto found_ok_skb;
2098 if (skb->h.th->fin)
2099 goto found_fin_ok;
2100 if (!(flags & MSG_PEEK))
2101 skb->used = 1;
2102 skb = skb->next;
2103 }
2104 while (skb != (struct sk_buff *)&sk->receive_queue);
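/*
 * A sketch of the arithmetic above (illustrative only): a segment with
 * sequence S carrying L data bytes covers [S, S+L) in sequence space,
 * with a SYN logically occupying S itself (hence the offset--).
 * Reading at sequence *seq therefore maps to data byte
 *
 *	offset = *seq - S	(less 1 if the segment carries a SYN)
 *
 * and the segment is useful to us while offset < L.
 */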
2105
2106 if (copied)
2107 break;
2108
2109 if (sk->err)
2110 {
2111 copied = -sk->err;
2112 sk->err = 0;
2113 break;
2114 }
2115
2116 if (sk->state == TCP_CLOSE)
2117 {
2118 if (!sk->done)
2119 {
2120 sk->done = 1;
2121 break;
2122 }
2123 copied = -ENOTCONN;
2124 break;
2125 }
2126
2127 if (sk->shutdown & RCV_SHUTDOWN)
2128 {
2129 sk->done = 1;
2130 break;
2131 }
2132
2133 if (nonblock)
2134 {
2135 copied = -EAGAIN;
2136 break;
2137 }
2138
2139 cleanup_rbuf(sk);
2140 release_sock(sk);
2141 sk->socket->flags |= SO_WAITDATA;
2142 schedule();
2143 sk->socket->flags &= ~SO_WAITDATA;
2144 sk->inuse = 1;
2145
2146 if (current->signal & ~current->blocked)
2147 {
2148 copied = -ERESTARTSYS;
2149 break;
2150 }
2151 continue;
2152
2153 found_ok_skb:
2154 /*
2155 * Lock the buffer. We can be fairly relaxed as
2156 * an interrupt will never steal a buffer we are
2157 * using unless I've missed something serious in
2158 * tcp_data.
2159 */
2160
2161 skb->users++;
2162
2163 /*
2164 * Ok so how much can we use ?
2165 */
2166
2167 used = skb->len - offset;
2168 if (len < used)
2169 used = len;
2170 /*
2171 * Do we have urgent data here?
2172 */
2173
2174 if (sk->urg_data)
2175 {
2176 u32 urg_offset = sk->urg_seq - *seq;
2177 if (urg_offset < used)
2178 {
2179 if (!urg_offset)
2180 {
2181 if (!sk->urginline)
2182 {
2183 ++*seq;
2184 offset++;
2185 used--;
2186 }
2187 }
2188 else
2189 used = urg_offset;
2190 }
2191 }
2192
2193 /*
2194 * Copy it - We _MUST_ update *seq first so that we
2195 * don't ever double read when we have dual readers
2196 */
2197
2198 *seq += used;
2199
2200 /*
2201 * This memcpy_tofs can sleep. If it sleeps and we
2202 * do a second read it relies on the skb->users to avoid
2203 * a crash when cleanup_rbuf() gets called.
2204 */
2205
2206 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2207 skb->h.th->doff*4 + offset, used);
2208 copied += used;
2209 len -= used;
2210 to += used;
2211
2212 /*
2213 * We now will not sleep again until we are finished
2214 * with skb. Sorry if you are doing the SMP port
2215 * but you'll just have to fix it neatly ;)
2216 */
2217
2218 skb->users --;
2219
2220 if (after(sk->copied_seq,sk->urg_seq))
2221 sk->urg_data = 0;
2222 if (used + offset < skb->len)
2223 continue;
2224
2225 /*
2226 * Process the FIN.
2227 */
2228
2229 if (skb->h.th->fin)
2230 goto found_fin_ok;
2231 if (flags & MSG_PEEK)
2232 continue;
2233 skb->used = 1;
2234 continue;
2235
2236 found_fin_ok:
2237 ++*seq;
2238 if (flags & MSG_PEEK)
2239 break;
2240
2241 /*
2242 * All is done
2243 */
2244
2245 skb->used = 1;
2246 sk->shutdown |= RCV_SHUTDOWN;
2247 break;
2248
2249 }
2250 remove_wait_queue(sk->sleep, &wait);
2251 current->state = TASK_RUNNING;
2252
2253 /* Clean up data we have read: This will do ACK frames */
2254 cleanup_rbuf(sk);
2255 release_sock(sk);
2256 return copied;
2257 }
2258
2259 /*
2260 * State processing on a close. This implements the state shift for
2261 * sending our FIN frame. Note that we only send a FIN for some
2262 * states. A shutdown() may have already sent the FIN, or we may be
2263 * closed.
2264 */
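/*
 * In summary (derived from the switch below):
 *
 *	SYN_RECV, ESTABLISHED	-> FIN_WAIT1	(send FIN)
 *	CLOSE_WAIT		-> LAST_ACK	(send FIN)
 *	FIN_WAIT1/2, CLOSING	-> unchanged	(FIN already sent)
 *	SYN_SENT, LISTEN, CLOSE	-> CLOSE	(no FIN needed)
 */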
2265
2266 static int tcp_close_state(struct sock *sk, int dead)
2267 {
2268 int ns=TCP_CLOSE;
2269 int send_fin=0;
2270 switch(sk->state)
2271 {
2272 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2273 break;
2274 case TCP_SYN_RECV:
2275 case TCP_ESTABLISHED: /* Closedown begin */
2276 ns=TCP_FIN_WAIT1;
2277 send_fin=1;
2278 break;
2279 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2280 case TCP_FIN_WAIT2:
2281 case TCP_CLOSING:
2282 ns=sk->state;
2283 break;
2284 case TCP_CLOSE:
2285 case TCP_LISTEN:
2286 break;
2287 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2288 wait only for the ACK */
2289 ns=TCP_LAST_ACK;
2290 send_fin=1;
2291 }
2292
2293 tcp_set_state(sk,ns);
2294
2295 /*
2296 * This is a (useful) BSD violation of the RFC. There is a
2297 * problem with TCP as specified, in that the other end could
2298 * keep a socket open forever with no application left at this end.
2299 * We use a 3 minute timeout (about the same as BSD) then kill
2300 * our end. If they send after that then tough - BUT the timeout
2301 * is long enough that we won't repeat the old "4*rto = almost no
2302 * time - whoops, reset" mistake.
2303 */
2304 if(dead && ns==TCP_FIN_WAIT2)
2305 {
2306 int timer_active=del_timer(&sk->timer);
2307 if(timer_active)
2308 add_timer(&sk->timer);
2309 else
2310 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2311 }
2312
2313 return send_fin;
2314 }
2315
2316 /*
2317 * Send a fin.
2318 */
2319
2320 static void tcp_send_fin(struct sock *sk)
2321 {
2322 struct proto *prot =(struct proto *)sk->prot;
2323 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2324 struct tcphdr *t1;
2325 struct sk_buff *buff;
2326 struct device *dev=NULL;
2327 int tmp;
2328
2329 release_sock(sk); /* in case the malloc sleeps. */
2330
2331 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2332 sk->inuse = 1;
2333
2334 if (buff == NULL)
2335 {
2336 /* This is a disaster if it occurs */
2337 printk("tcp_send_fin: Impossible malloc failure");
2338 return;
2339 }
2340
2341 /*
2342 * Administrivia
2343 */
2344
2345 buff->sk = sk;
2346 buff->localroute = sk->localroute;
2347
2348 /*
2349 * Put in the IP header and routing stuff.
2350 */
2351
2352 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2353 IPPROTO_TCP, sk->opt,
2354 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2355 if (tmp < 0)
2356 {
2357 int t;
2358 /*
2359 * Finish anyway, treat this as a send that got lost.
2360 * (Not good).
2361 */
2362
2363 buff->free = 1;
2364 prot->wfree(sk,buff);
2365 sk->write_seq++;
2366 t=del_timer(&sk->timer);
2367 if(t)
2368 add_timer(&sk->timer);
2369 else
2370 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2371 return;
2372 }
2373
2374 /*
2375 * We ought to check if the end of the queue is a buffer and
2376 * if so simply add the fin to that buffer, not send it ahead.
2377 */
2378
2379 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2380 buff->dev = dev;
2381 memcpy(t1, th, sizeof(*t1));
2382 t1->seq = ntohl(sk->write_seq);
2383 sk->write_seq++;
2384 buff->h.seq = sk->write_seq;
2385 t1->ack = 1;
2386 t1->ack_seq = ntohl(sk->acked_seq);
2387 t1->window = ntohs(sk->window=tcp_select_window(sk));
2388 t1->fin = 1;
2389 t1->rst = 0;
2390 t1->doff = sizeof(*t1)/4;
2391 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2392
2393 /*
2394 * If there is data in the write queue, the fin must be appended to
2395 * the write queue.
2396 */
2397
2398 if (skb_peek(&sk->write_queue) != NULL)
2399 {
2400 buff->free = 0;
2401 if (buff->next != NULL)
2402 {
2403 printk("tcp_send_fin: next != NULL\n");
2404 skb_unlink(buff);
2405 }
2406 skb_queue_tail(&sk->write_queue, buff);
2407 }
2408 else
2409 {
2410 sk->sent_seq = sk->write_seq;
2411 sk->prot->queue_xmit(sk, dev, buff, 0);
2412 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2413 }
2414 }
2415
2416 /*
2417 * Shutdown the sending side of a connection. Much like close except
2418 * that we don't receive shut down or set sk->dead=1.
2419 */
2420
2421 void tcp_shutdown(struct sock *sk, int how)
2422 {
2423 /*
2424 * We need to grab some memory, and put together a FIN,
2425 * and then put it into the queue to be sent.
2426 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2427 */
2428
2429 if (!(how & SEND_SHUTDOWN))
2430 return;
2431
2432 /*
2433 * If we've already sent a FIN, or it's a closed state
2434 */
2435
2436 if (sk->state == TCP_FIN_WAIT1 ||
2437 sk->state == TCP_FIN_WAIT2 ||
2438 sk->state == TCP_CLOSING ||
2439 sk->state == TCP_LAST_ACK ||
2440 sk->state == TCP_TIME_WAIT ||
2441 sk->state == TCP_CLOSE ||
2442 sk->state == TCP_LISTEN
2443 )
2444 {
2445 return;
2446 }
2447 sk->inuse = 1;
2448
2449 /*
2450 * flag that the sender has shutdown
2451 */
2452
2453 sk->shutdown |= SEND_SHUTDOWN;
2454
2455 /*
2456 * Clear out any half completed packets.
2457 */
2458
2459 if (sk->partial)
2460 tcp_send_partial(sk);
2461
2462 /*
2463 * FIN if needed
2464 */
2465
2466 if(tcp_close_state(sk,0))
2467 tcp_send_fin(sk);
2468
2469 release_sock(sk);
2470 }
2471
2472
2473 static int
2474 tcp_recvfrom(struct sock *sk, unsigned char *to,
2475 int to_len, int nonblock, unsigned flags,
2476 struct sockaddr_in *addr, int *addr_len)
2477 {
2478 int result;
2479
2480 /*
2481 * Have to check these first, unlike the old code. If we
2482 * checked them afterwards we could lose data on an error,
2483 * which is wrong.
2484 */
2485
2486 if(addr_len)
2487 *addr_len = sizeof(*addr);
2488 result=tcp_read(sk, to, to_len, nonblock, flags);
2489
2490 if (result < 0)
2491 return(result);
2492
2493 if(addr)
2494 {
2495 addr->sin_family = AF_INET;
2496 addr->sin_port = sk->dummy_th.dest;
2497 addr->sin_addr.s_addr = sk->daddr;
2498 }
2499 return(result);
2500 }
2501
2502
2503 /*
2504 * This routine will send an RST to the other tcp.
2505 */
2506
2507 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2508 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2509 {
2510 struct sk_buff *buff;
2511 struct tcphdr *t1;
2512 int tmp;
2513 struct device *ndev=NULL;
2514
2515 /*
2516 * Cannot reset a reset (Think about it).
2517 */
2518
2519 if(th->rst)
2520 return;
2521
2522 /*
2523 * We need to grab some memory, and put together an RST,
2524 * and then put it into the queue to be sent.
2525 */
2526
2527 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2528 if (buff == NULL)
2529 return;
2530
2531 buff->sk = NULL;
2532 buff->dev = dev;
2533 buff->localroute = 0;
2534
2535 /*
2536 * Put in the IP header and routing stuff.
2537 */
2538
2539 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2540 sizeof(struct tcphdr),tos,ttl);
2541 if (tmp < 0)
2542 {
2543 buff->free = 1;
2544 prot->wfree(NULL, buff);
2545 return;
2546 }
2547
2548 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2549 memcpy(t1, th, sizeof(*t1));
2550
2551 /*
2552 * Swap the send and the receive.
2553 */
2554
2555 t1->dest = th->source;
2556 t1->source = th->dest;
2557 t1->rst = 1;
2558 t1->window = 0;
2559
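/*
 * Sequence numbers here follow the RFC 793 reset generation rules:
 * if the offending segment carried an ACK we reply with
 * <SEQ=SEG.ACK><CTL=RST>; otherwise we reply with
 * <SEQ=0><ACK=SEG.SEQ(+1 for a SYN)><CTL=RST,ACK> so that the reset
 * is acceptable to the other end.
 */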
2560 if(th->ack)
2561 {
2562 t1->ack = 0;
2563 t1->seq = th->ack_seq;
2564 t1->ack_seq = 0;
2565 }
2566 else
2567 {
2568 t1->ack = 1;
2569 if(!th->syn)
2570 t1->ack_seq=htonl(th->seq);
2571 else
2572 t1->ack_seq=htonl(th->seq+1);
2573 t1->seq=0;
2574 }
2575
2576 t1->syn = 0;
2577 t1->urg = 0;
2578 t1->fin = 0;
2579 t1->psh = 0;
2580 t1->doff = sizeof(*t1)/4;
2581 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2582 prot->queue_xmit(NULL, ndev, buff, 1);
2583 tcp_statistics.TcpOutSegs++;
2584 }
2585
2586
2587 /*
2588 * Look for tcp options. Parses everything but only knows about MSS.
2589 * This routine is always called with the packet containing the SYN.
2590 * However it may also be called with the ack to the SYN. So you
2591 * can't assume this is always the SYN. It's always called after
2592 * we have set up sk->mtu to our own MTU.
2593 *
2594 * We need at minimum to add PAWS support here. Possibly large windows
2595 * as Linux gets deployed on 100Mb/sec networks.
2596 */
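/*
 * For reference, the option wire format being parsed (RFC 793):
 * each option is <kind:1 byte><length:1 byte><data>, except for the
 * single byte EOL (0) and NOP (1) kinds. An MSS option is four bytes
 * in total:
 *
 *	kind=2, len=4, mss (16 bits, network byte order)
 */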
2597
2598 static void tcp_options(struct sock *sk, struct tcphdr *th)
2599 {
2600 unsigned char *ptr;
2601 int length=(th->doff*4)-sizeof(struct tcphdr);
2602 int mss_seen = 0;
2603
2604 ptr = (unsigned char *)(th + 1);
2605
2606 while(length>0)
2607 {
2608 int opcode=*ptr++;
2609 int opsize=*ptr++;
2610 switch(opcode)
2611 {
2612 case TCPOPT_EOL:
2613 return;
2614 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2615 length--;
2616 ptr--; /* the opsize=*ptr++ above was a mistake */
2617 continue;
2618
2619 default:
2620 if(opsize<=2) /* Avoid silly options looping forever */
2621 return;
2622 switch(opcode)
2623 {
2624 case TCPOPT_MSS:
2625 if(opsize==4 && th->syn)
2626 {
2627 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2628 mss_seen = 1;
2629 }
2630 break;
2631 /* Add other options here as people feel the urge to implement stuff like large windows */
2632 }
2633 ptr+=opsize-2;
2634 length-=opsize;
2635 }
2636 }
2637 if (th->syn)
2638 {
2639 if (! mss_seen)
2640 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2641 }
2642 #ifdef CONFIG_INET_PCTCP
2643 sk->mss = min(sk->max_window >> 1, sk->mtu);
2644 #else
2645 sk->mss = min(sk->max_window, sk->mtu);
2646 #endif
2647 }
2648
2649 static inline unsigned long default_mask(unsigned long dst)
2650 {
2651 dst = ntohl(dst);
2652 if (IN_CLASSA(dst))
2653 return htonl(IN_CLASSA_NET);
2654 if (IN_CLASSB(dst))
2655 return htonl(IN_CLASSB_NET);
2656 return htonl(IN_CLASSC_NET);
2657 }
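/*
 * Illustrative examples of the classful defaults above: 10.1.2.3 is
 * class A -> mask 255.0.0.0, 172.16.0.1 is class B -> mask 255.255.0.0,
 * and 192.168.1.1 is class C -> mask 255.255.255.0.
 */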
2658
2659 /*
2660 * Default sequence number picking algorithm.
2661 * As close as possible to RFC 793, which
2662 * suggests using a 250kHz clock.
2663 * Further reading shows this assumes 2MB/s networks.
2664 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2665 * That's funny, Linux has one built in! Use it!
2666 */
2667
2668 extern inline u32 tcp_init_seq(void)
2669 {
2670 struct timeval tv;
2671 do_gettimeofday(&tv);
2672 return tv.tv_usec+tv.tv_sec*1000000;
2673 }
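/*
 * Note on the above: a 1MHz counter wraps the 32 bit sequence space
 * every 2^32 microseconds, i.e. roughly every 71.6 minutes, against
 * the approximately 4.55 hour cycle RFC 793 quotes for its 250kHz
 * clock.
 */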
2674
2675 /*
2676 * This routine handles a connection request.
2677 * It should make sure we haven't already responded.
2678 * Because of the way BSD works, we have to send a syn/ack now.
2679 * This also means it will be harder to close a socket which is
2680 * listening.
2681 */
2682
2683 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2684 unsigned long daddr, unsigned long saddr,
2685 struct options *opt, struct device *dev, u32 seq)
2686 {
2687 struct sk_buff *buff;
2688 struct tcphdr *t1;
2689 unsigned char *ptr;
2690 struct sock *newsk;
2691 struct tcphdr *th;
2692 struct device *ndev=NULL;
2693 int tmp;
2694 struct rtable *rt;
2695
2696 th = skb->h.th;
2697
2698 /* If the socket is dead, don't accept the connection. */
2699 if (!sk->dead)
2700 {
2701 sk->data_ready(sk,0);
2702 }
2703 else
2704 {
2705 if(sk->debug)
2706 printk("Reset on %p: Connect on dead socket.\n",sk);
2707 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2708 tcp_statistics.TcpAttemptFails++;
2709 kfree_skb(skb, FREE_READ);
2710 return;
2711 }
2712
2713 /*
2714 * Make sure we can accept more. This will prevent a
2715 * flurry of syns from eating up all our memory.
2716 */
2717
2718 if (sk->ack_backlog >= sk->max_ack_backlog)
2719 {
2720 tcp_statistics.TcpAttemptFails++;
2721 kfree_skb(skb, FREE_READ);
2722 return;
2723 }
2724
2725 /*
2726 * We need to build a new sock struct.
2727 * It is sort of bad to have a socket without an inode attached
2728 * to it, but the wake_up's will just wake up the listening socket,
2729 * and if the listening socket is destroyed before this is taken
2730 * off of the queue, this will take care of it.
2731 */
2732
2733 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2734 if (newsk == NULL)
2735 {
2736 /* just ignore the syn. It will get retransmitted. */
2737 tcp_statistics.TcpAttemptFails++;
2738 kfree_skb(skb, FREE_READ);
2739 return;
2740 }
2741
2742 memcpy(newsk, sk, sizeof(*newsk));
2743 skb_queue_head_init(&newsk->write_queue);
2744 skb_queue_head_init(&newsk->receive_queue);
2745 newsk->send_head = NULL;
2746 newsk->send_tail = NULL;
2747 skb_queue_head_init(&newsk->back_log);
2748 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2749 newsk->rto = TCP_TIMEOUT_INIT;
2750 newsk->mdev = 0;
2751 newsk->max_window = 0;
2752 newsk->cong_window = 1;
2753 newsk->cong_count = 0;
2754 newsk->ssthresh = 0;
2755 newsk->backoff = 0;
2756 newsk->blog = 0;
2757 newsk->intr = 0;
2758 newsk->proc = 0;
2759 newsk->done = 0;
2760 newsk->partial = NULL;
2761 newsk->pair = NULL;
2762 newsk->wmem_alloc = 0;
2763 newsk->rmem_alloc = 0;
2764 newsk->localroute = sk->localroute;
2765
2766 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2767
2768 newsk->err = 0;
2769 newsk->shutdown = 0;
2770 newsk->ack_backlog = 0;
2771 newsk->acked_seq = skb->h.th->seq+1;
2772 newsk->copied_seq = skb->h.th->seq+1;
2773 newsk->fin_seq = skb->h.th->seq;
2774 newsk->state = TCP_SYN_RECV;
2775 newsk->timeout = 0;
2776 newsk->ip_xmit_timeout = 0;
2777 newsk->write_seq = seq;
2778 newsk->window_seq = newsk->write_seq;
2779 newsk->rcv_ack_seq = newsk->write_seq;
2780 newsk->urg_data = 0;
2781 newsk->retransmits = 0;
2782 newsk->linger=0;
2783 newsk->destroy = 0;
2784 init_timer(&newsk->timer);
2785 newsk->timer.data = (unsigned long)newsk;
2786 newsk->timer.function = &net_timer;
2787 init_timer(&newsk->retransmit_timer);
2788 newsk->retransmit_timer.data = (unsigned long)newsk;
2789 newsk->retransmit_timer.function=&retransmit_timer;
2790 newsk->dummy_th.source = skb->h.th->dest;
2791 newsk->dummy_th.dest = skb->h.th->source;
2792
2793 /*
2794 * Swap these two, they are from our point of view.
2795 */
2796
2797 newsk->daddr = saddr;
2798 newsk->saddr = daddr;
2799
2800 put_sock(newsk->num,newsk);
2801 newsk->dummy_th.res1 = 0;
2802 newsk->dummy_th.doff = 6;
2803 newsk->dummy_th.fin = 0;
2804 newsk->dummy_th.syn = 0;
2805 newsk->dummy_th.rst = 0;
2806 newsk->dummy_th.psh = 0;
2807 newsk->dummy_th.ack = 0;
2808 newsk->dummy_th.urg = 0;
2809 newsk->dummy_th.res2 = 0;
2810 newsk->acked_seq = skb->h.th->seq + 1;
2811 newsk->copied_seq = skb->h.th->seq + 1;
2812 newsk->socket = NULL;
2813
2814 /*
2815 * Grab the ttl and tos values and use them
2816 */
2817
2818 newsk->ip_ttl=sk->ip_ttl;
2819 newsk->ip_tos=skb->ip_hdr->tos;
2820
2821 /*
2822 * Use 512 or whatever the user asked for
2823 */
2824
2825 /*
2826 * Note use of sk->user_mss, since user has no direct access to newsk
2827 */
2828
2829 rt=ip_rt_route(saddr, NULL,NULL);
2830
2831 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2832 newsk->window_clamp = rt->rt_window;
2833 else
2834 newsk->window_clamp = 0;
2835
2836 if (sk->user_mss)
2837 newsk->mtu = sk->user_mss;
2838 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2839 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2840 else
2841 {
2842 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
2843 if ((saddr ^ daddr) & default_mask(saddr))
2844 #else
2845 if ((saddr ^ daddr) & dev->pa_mask)
2846 #endif
2847 newsk->mtu = 576 - HEADER_SIZE;
2848 else
2849 newsk->mtu = MAX_WINDOW;
2850 }
2851
2852 /*
2853 * But not bigger than device MTU
2854 */
2855
2856 newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2857
2858 /*
2859 * This will be min'd with the MSS value that arrived in the packet
2860 */
2861
2862 tcp_options(newsk,skb->h.th);
2863
2864 tcp_cache_zap();
2865
2866 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2867 if (buff == NULL)
2868 {
2869 sk->err = ENOMEM;
2870 newsk->dead = 1;
2871 newsk->state = TCP_CLOSE;
2872 /* And this will destroy it */
2873 release_sock(newsk);
2874 kfree_skb(skb, FREE_READ);
2875 tcp_statistics.TcpAttemptFails++;
2876 return;
2877 }
2878
2879 buff->sk = newsk;
2880 buff->localroute = newsk->localroute;
2881
2882 /*
2883 * Put in the IP header and routing stuff.
2884 */
2885
2886 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2887 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2888
2889 /*
2890 * Something went wrong.
2891 */
2892
2893 if (tmp < 0)
2894 {
2895 sk->err = tmp;
2896 buff->free = 1;
2897 kfree_skb(buff,FREE_WRITE);
2898 newsk->dead = 1;
2899 newsk->state = TCP_CLOSE;
2900 release_sock(newsk);
2901 skb->sk = sk;
2902 kfree_skb(skb, FREE_READ);
2903 tcp_statistics.TcpAttemptFails++;
2904 return;
2905 }
2906
2907 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2908
2909 memcpy(t1, skb->h.th, sizeof(*t1));
2910 buff->h.seq = newsk->write_seq;
2911 /*
2912 * Swap the send and the receive.
2913 */
2914 t1->dest = skb->h.th->source;
2915 t1->source = newsk->dummy_th.source;
2916 t1->seq = ntohl(newsk->write_seq++);
2917 t1->ack = 1;
2918 newsk->window = tcp_select_window(newsk);
2919 newsk->sent_seq = newsk->write_seq;
2920 t1->window = ntohs(newsk->window);
2921 t1->res1 = 0;
2922 t1->res2 = 0;
2923 t1->rst = 0;
2924 t1->urg = 0;
2925 t1->psh = 0;
2926 t1->syn = 1;
2927 t1->ack_seq = ntohl(skb->h.th->seq+1);
2928 t1->doff = sizeof(*t1)/4+1;
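/*
 * Append the MSS option: kind 2, length 4, then the 16 bit MSS
 * value in network byte order.
 */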
2929 ptr = skb_put(buff,4);
2930 ptr[0] = 2;
2931 ptr[1] = 4;
2932 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2933 ptr[3] =(newsk->mtu) & 0xff;
2934
2935 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2936 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2937 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2938 skb->sk = newsk;
2939
2940 /*
2941 * Charge the sock_buff to newsk.
2942 */
2943
2944 sk->rmem_alloc -= skb->truesize;
2945 newsk->rmem_alloc += skb->truesize;
2946
2947 skb_queue_tail(&sk->receive_queue,skb);
2948 sk->ack_backlog++;
2949 release_sock(newsk);
2950 tcp_statistics.TcpOutSegs++;
2951 }
2952
2953
2954 static void tcp_close(struct sock *sk, int timeout)
2955 {
2956 /*
2957 * We need to grab some memory, and put together a FIN,
2958 * and then put it into the queue to be sent.
2959 */
2960
2961 sk->inuse = 1;
2962
2963 if(th_cache_sk==sk)
2964 tcp_cache_zap();
2965 if(sk->state == TCP_LISTEN)
2966 {
2967 /* Special case */
2968 tcp_set_state(sk, TCP_CLOSE);
2969 tcp_close_pending(sk);
2970 release_sock(sk);
2971 return;
2972 }
2973
2974 sk->keepopen = 1;
2975 sk->shutdown = SHUTDOWN_MASK;
2976
2977 if (!sk->dead)
2978 sk->state_change(sk);
2979
2980 if (timeout == 0)
2981 {
2982 struct sk_buff *skb;
2983
2984 /*
2985 * We need to flush the recv. buffs. We do this only on the
2986 * descriptor close, not protocol-sourced closes, because the
2987 * reader process may not have drained the data yet!
2988 */
2989
2990 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2991 kfree_skb(skb, FREE_READ);
2992 /*
2993 * Get rid of any half-completed packets.
2994 */
2995
2996 if (sk->partial)
2997 tcp_send_partial(sk);
2998 }
2999
3000
3001 /*
3002 * Timeout is not the same thing - however the code likes
3003 * to send both the same way (sigh).
3004 */
3005
3006 if(timeout)
3007 {
3008 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3009 }
3010 else
3011 {
3012 if(tcp_close_state(sk,1)==1)
3013 {
3014 tcp_send_fin(sk);
3015 }
3016 }
3017 release_sock(sk);
3018 }
3019
3020
3021 /*
3022 * This routine takes stuff off of the write queue,
3023 * and puts it in the xmit queue. This happens as incoming acks
3024 * open up the remote window for us.
3025 */
3026
3027 static void tcp_write_xmit(struct sock *sk)
3028 {
3029 struct sk_buff *skb;
3030
3031 /*
3032 * The bytes will have to remain here. In time closedown will
3033 * empty the write queue and all will be happy
3034 */
3035
3036 if(sk->zapped)
3037 return;
3038
3039 /*
3040 * Anything on the transmit queue that fits the window can
3041 * be added providing we are not
3042 *
3043 * a) retransmitting (Nagle's rule)
3044 * b) exceeding our congestion window.
3045 */
3046
3047 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3048 before(skb->h.seq, sk->window_seq + 1) &&
3049 (sk->retransmits == 0 ||
3050 sk->ip_xmit_timeout != TIME_WRITE ||
3051 before(skb->h.seq, sk->rcv_ack_seq + 1))
3052 && sk->packets_out < sk->cong_window)
3053 {
3054 IS_SKB(skb);
3055 skb_unlink(skb);
3056
3057 /*
3058 * See if we really need to send the packet.
3059 */
3060
3061 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3062 {
3063 /*
3064 * This is acked data. We can discard it. This
3065 * cannot currently occur.
3066 */
3067
3068 sk->retransmits = 0;
3069 kfree_skb(skb, FREE_WRITE);
3070 if (!sk->dead)
3071 sk->write_space(sk);
3072 }
3073 else
3074 {
3075 struct tcphdr *th;
3076 struct iphdr *iph;
3077 int size;
3078 /*
3079 * put in the ack seq and window at this point rather than earlier,
3080 * in order to keep them monotonic. We really want to avoid taking
3081 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3082 * Ack and window will in general have changed since this packet was put
3083 * on the write queue.
3084 */
3085 iph = skb->ip_hdr;
3086 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3087 size = skb->len - (((unsigned char *) th) - skb->data);
3088
3089 th->ack_seq = ntohl(sk->acked_seq);
3090 th->window = ntohs(tcp_select_window(sk));
3091
3092 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3093
3094 sk->sent_seq = skb->h.seq;
3095
3096 /*
3097 * IP manages our queue for some crazy reason
3098 */
3099
3100 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3101
3102 /*
3103 * Again we slide the timer wrongly
3104 */
3105
3106 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3107 }
3108 }
3109 }
3110
3111
3112 /*
3113 * This routine deals with incoming acks, but not outgoing ones.
3114 */
3115
3116 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3117 {
3118 u32 ack;
3119 int flag = 0;
3120
3121 /*
3122 * Flag bits: 1 - there was data in the packet as well as the ack, or
3123 * new data was sent, or we are in a shutdown state;
3124 * 2 - data from the retransmit queue was acked and removed;
3125 * 4 - the window shrank, or data from the retransmit queue was acked and removed
3126 */
3127
3128 if(sk->zapped)
3129 return(1); /* Dead, can't ack any more so why bother */
3130
3131 /*
3132 * Have we discovered a larger window
3133 */
3134
3135 ack = ntohl(th->ack_seq);
3136
3137 if (ntohs(th->window) > sk->max_window)
3138 {
3139 sk->max_window = ntohs(th->window);
3140 #ifdef CONFIG_INET_PCTCP
3141 /* Hack because we don't send partial packets to non SWS
3142 handling hosts */
3143 sk->mss = min(sk->max_window>>1, sk->mtu);
3144 #else
3145 sk->mss = min(sk->max_window, sk->mtu);
3146 #endif
3147 }
3148
3149 /*
3150 * We have dropped back to keepalive timeouts. Thus we have
3151 * no retransmits pending.
3152 */
3153
3154 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3155 sk->retransmits = 0;
3156
3157 /*
3158 * If the ack is newer than sent or older than previous acks
3159 * then we can probably ignore it.
3160 */
3161
3162 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3163 {
3164 if(sk->debug)
3165 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3166
3167 /*
3168 * Keepalive processing.
3169 */
3170
3171 if (after(ack, sk->sent_seq))
3172 {
3173 return(0);
3174 }
3175
3176 /*
3177 * Restart the keepalive timer.
3178 */
3179
3180 if (sk->keepopen)
3181 {
3182 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3183 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3184 }
3185 return(1);
3186 }
3187
3188 /*
3189 * If there is data set flag 1
3190 */
3191
3192 if (len != th->doff*4)
3193 flag |= 1;
3194
3195 /*
3196 * See if our window has been shrunk.
3197 */
3198
3199 if (after(sk->window_seq, ack+ntohs(th->window)))
3200 {
3201 /*
3202 * We may need to move packets from the send queue
3203 * to the write queue, if the window has been shrunk on us.
3204 * The RFC says you are not allowed to shrink your window
3205 * like this, but if the other end does, you must be able
3206 * to deal with it.
3207 */
3208 struct sk_buff *skb;
3209 struct sk_buff *skb2;
3210 struct sk_buff *wskb = NULL;
3211
3212 skb2 = sk->send_head;
3213 sk->send_head = NULL;
3214 sk->send_tail = NULL;
3215
3216 /*
3217 * This is an artifact of a flawed concept. We want one
3218 * queue and a smarter send routine when we send all.
3219 */
3220
3221 flag |= 4; /* Window changed */
3222
3223 sk->window_seq = ack + ntohs(th->window);
3224 cli();
3225 while (skb2 != NULL)
3226 {
3227 skb = skb2;
3228 skb2 = skb->link3;
3229 skb->link3 = NULL;
3230 if (after(skb->h.seq, sk->window_seq))
3231 {
3232 if (sk->packets_out > 0)
3233 sk->packets_out--;
3234 /* We may need to remove this from the dev send list. */
3235 if (skb->next != NULL)
3236 {
3237 skb_unlink(skb);
3238 }
3239 /* Now add it to the write_queue. */
3240 if (wskb == NULL)
3241 skb_queue_head(&sk->write_queue,skb);
3242 else
3243 skb_append(wskb,skb);
3244 wskb = skb;
3245 }
3246 else
3247 {
3248 if (sk->send_head == NULL)
3249 {
3250 sk->send_head = skb;
3251 sk->send_tail = skb;
3252 }
3253 else
3254 {
3255 sk->send_tail->link3 = skb;
3256 sk->send_tail = skb;
3257 }
3258 skb->link3 = NULL;
3259 }
3260 }
3261 sti();
3262 }
3263
3264 /*
3265 * Pipe has emptied
3266 */
3267
3268 if (sk->send_tail == NULL || sk->send_head == NULL)
3269 {
3270 sk->send_head = NULL;
3271 sk->send_tail = NULL;
3272 sk->packets_out= 0;
3273 }
3274
3275 /*
3276 * Update the right hand window edge of the host
3277 */
3278
3279 sk->window_seq = ack + ntohs(th->window);
3280
3281 /*
3282 * We don't want too many packets out there.
3283 */
3284
3285 if (sk->ip_xmit_timeout == TIME_WRITE &&
3286 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3287 {
3288 /*
3289 * This is Jacobson's slow start and congestion avoidance.
3290 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3291 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3292 * counter and increment it once every cwnd times. It's possible
3293 * that this should be done only if sk->retransmits == 0. I'm
3294 * interpreting "new data is acked" as including data that has
3295 * been retransmitted but is just now being acked.
3296 */
3297 if (sk->cong_window < sk->ssthresh)
3298 /*
3299 * In "safe" area, increase
3300 */
3301 sk->cong_window++;
3302 else
3303 {
3304 /*
3305 * In dangerous area, increase slowly. In theory this is
3306 * sk->cong_window += 1 / sk->cong_window
3307 */
3308 if (sk->cong_count >= sk->cong_window)
3309 {
3310 sk->cong_window++;
3311 sk->cong_count = 0;
3312 }
3313 else
3314 sk->cong_count++;
3315 }
3316 }
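/*
 * A rough sketch of the policy above (illustrative only), with
 * cong_window kept in whole segments:
 *
 *	on each ack of new data:
 *		if (cong_window < ssthresh)
 *			cong_window++;		(slow start: doubles per RTT)
 *		else if (cong_count >= cong_window) {
 *			cong_window++;		(about +1 segment per RTT)
 *			cong_count = 0;
 *		} else
 *			cong_count++;
 */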
3317
3318 /*
3319 * Remember the highest ack received.
3320 */
3321
3322 sk->rcv_ack_seq = ack;
3323
3324 /*
3325 * If this ack opens up a zero window, clear backoff. It was
3326 * being used to time the probes, and is probably far higher than
3327 * it needs to be for normal retransmission.
3328 */
3329
3330 if (sk->ip_xmit_timeout == TIME_PROBE0)
3331 {
3332 sk->retransmits = 0; /* Our probe was answered */
3333
3334 /*
3335 * Was it a usable window open ?
3336 */
3337
3338 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3339 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3340 {
3341 sk->backoff = 0;
3342
3343 /*
3344 * Recompute rto from rtt. this eliminates any backoff.
3345 */
3346
3347 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3348 if (sk->rto > 120*HZ)
3349 sk->rto = 120*HZ;
3350 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3351 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3352 .2 of a second is going to need huge windows (SIGH) */
3353 sk->rto = 20;
3354 }
3355 }
3356
3357 /*
3358 * See if we can take anything off of the retransmit queue.
3359 */
3360
3361 while(sk->send_head != NULL)
3362 {
3363 /* Check for a bug. */
3364 if (sk->send_head->link3 &&
3365 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3366 printk("INET: tcp.c: *** bug send_list out of order.\n");
3367
3368 /*
3369 * If our packet is before the ack sequence we can
3370 * discard it as it's confirmed to have arrived at the other end.
3371 */
3372
3373 if (before(sk->send_head->h.seq, ack+1))
3374 {
3375 struct sk_buff *oskb;
3376 if (sk->retransmits)
3377 {
3378 /*
3379 * We were retransmitting. don't count this in RTT est
3380 */
3381 flag |= 2;
3382
3383 /*
3384 * even though we've gotten an ack, we're still
3385 * retransmitting as long as we're sending from
3386 * the retransmit queue. Keeping retransmits non-zero
3387 * prevents us from getting new data interspersed with
3388 * retransmissions.
3389 */
3390
3391 if (sk->send_head->link3) /* Any more queued retransmits? */
3392 sk->retransmits = 1;
3393 else
3394 sk->retransmits = 0;
3395 }
3396 /*
3397 * Note that we only reset backoff and rto in the
3398 * rtt recomputation code. And that doesn't happen
3399 * if there were retransmissions in effect. So the
3400 * first new packet after the retransmissions is
3401 * sent with the backoff still in effect. Not until
3402 * we get an ack from a non-retransmitted packet do
3403 * we reset the backoff and rto. This allows us to deal
3404 * with a situation where the network delay has increased
3405 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3406 */
3407
3408 /*
3409 * We have one less packet out there.
3410 */
3411
3412 if (sk->packets_out > 0)
3413 sk->packets_out --;
3414 /*
3415 * Wake up the process, it can probably write more.
3416 */
3417 if (!sk->dead)
3418 sk->write_space(sk);
3419 oskb = sk->send_head;
3420
3421 if (!(flag&2)) /* Not retransmitting */
3422 {
3423 long m;
3424
3425 /*
3426 * The following amusing code comes from Jacobson's
3427 * article in SIGCOMM '88. Note that rtt and mdev
3428 * are scaled versions of rtt and mean deviation.
3429 * This is designed to be as fast as possible
3430 * m stands for "measurement".
3431 */
3432
3433 m = jiffies - oskb->when; /* RTT */
3434 if(m<=0)
3435 m=1; /* IS THIS RIGHT FOR <0 ??? */
3436 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3437 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3438 if (m < 0)
3439 m = -m; /* m is now abs(error) */
3440 m -= (sk->mdev >> 2); /* similar update on mdev */
3441 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3442
3443 /*
3444 * Now update timeout. Note that this removes any backoff.
3445 */
3446
3447 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3448 if (sk->rto > 120*HZ)
3449 sk->rto = 120*HZ;
3450 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3451 sk->rto = 20;
3452 sk->backoff = 0;
3453 }
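/*
 * With sk->rtt holding 8 times the smoothed RTT and sk->mdev holding
 * 4 times the mean deviation, the rto computation above works out as
 *
 *	rto = ((8*srtt >> 2) + 4*mdev) >> 1 = srtt + 2*mdev
 *
 * which is the classic Jacobson retransmit timeout.
 */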
3454 flag |= (2|4); /* 2 is really more like "don't adjust the rtt
3455 in this case", as we just set it up */
3456 cli();
3457 oskb = sk->send_head;
3458 IS_SKB(oskb);
3459 sk->send_head = oskb->link3;
3460 if (sk->send_head == NULL)
3461 {
3462 sk->send_tail = NULL;
3463 }
3464
3465 /*
3466 * We may need to remove this from the dev send list.
3467 */
3468
3469 if (oskb->next)
3470 skb_unlink(oskb);
3471 sti();
3472 kfree_skb(oskb, FREE_WRITE); /* write. */
3473 if (!sk->dead)
3474 sk->write_space(sk);
3475 }
3476 else
3477 {
3478 break;
3479 }
3480 }
3481
3482 /*
3483 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3484 * returns non-NULL, we completely ignore the timer stuff in the else
3485 * clause. We ought to organize the code so that the else clause can
3486 * (should) be executed regardless, possibly moving the PROBE timer
3487 * reset over. The skb_peek() thing should only move stuff to the
3488 * write queue, NOT also manage the timer functions.
3489 */
3490
3491 /*
3492 * Maybe we can take some stuff off of the write queue,
3493 * and put it onto the xmit queue.
3494 */
3495 if (skb_peek(&sk->write_queue) != NULL)
3496 {
3497 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3498 (sk->retransmits == 0 ||
3499 sk->ip_xmit_timeout != TIME_WRITE ||
3500 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3501 && sk->packets_out < sk->cong_window)
3502 {
3503 /*
3504 * Add more data to the send queue.
3505 */
3506 flag |= 1;
3507 tcp_write_xmit(sk);
3508 }
3509 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3510 sk->send_head == NULL &&
3511 sk->ack_backlog == 0 &&
3512 sk->state != TCP_TIME_WAIT)
3513 {
3514 /*
3515 * Data to queue but no room.
3516 */
3517 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3518 }
3519 }
3520 else
3521 {
3522 /*
3523 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3524 * from TCP_CLOSE we don't do anything
3525 *
3526 * from anything else, if there is write data (or fin) pending,
3527 * we use a TIME_WRITE timeout, else if keepalive we reset to
3528 * a KEEPALIVE timeout, else we delete the timer.
3529 *
3530 * We do not set flag for nominal write data, otherwise we may
3531 * force a state where we start to write itsy bitsy tidbits
3532 * of data.
3533 */
3534
3535 switch(sk->state) {
3536 case TCP_TIME_WAIT:
3537 /*
3538 * keep us in TIME_WAIT until we stop getting packets,
3539 * reset the timeout.
3540 */
3541 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3542 break;
3543 case TCP_CLOSE:
3544 /*
3545 * don't touch the timer.
3546 */
3547 break;
3548 default:
3549 /*
3550 * Must check send_head, write_queue, and ack_backlog
3551 * to determine which timeout to use.
3552 */
3553 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3554 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3555 } else if (sk->keepopen) {
3556 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3557 } else {
3558 del_timer(&sk->retransmit_timer);
3559 sk->ip_xmit_timeout = 0;
3560 }
3561 break;
3562 }
3563 }
3564
3565 /*
3566 * We have nothing queued but space to send. Send any partial
3567 * packets immediately (end of Nagle rule application).
3568 */
3569
3570 if (sk->packets_out == 0 && sk->partial != NULL &&
3571 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3572 {
3573 flag |= 1;
3574 tcp_send_partial(sk);
3575 }
3576
3577 /*
3578 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3579 * we are now waiting for an acknowledge to our FIN. The other end is
3580 * already in TIME_WAIT.
3581 *
3582 * Move to TCP_CLOSE on success.
3583 */
3584
3585 if (sk->state == TCP_LAST_ACK)
3586 {
3587 if (!sk->dead)
3588 sk->state_change(sk);
3589 if(sk->debug)
3590 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3591 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3592 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3593 {
3594 flag |= 1;
3595 tcp_set_state(sk,TCP_CLOSE);
3596 sk->shutdown = SHUTDOWN_MASK;
3597 }
3598 }
3599
3600 /*
3601 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3602 *
3603 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3604 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3605 */
3606
3607 if (sk->state == TCP_FIN_WAIT1)
3608 {
3609
3610 if (!sk->dead)
3611 sk->state_change(sk);
3612 if (sk->rcv_ack_seq == sk->write_seq)
3613 {
3614 flag |= 1;
3615 sk->shutdown |= SEND_SHUTDOWN;
3616 tcp_set_state(sk, TCP_FIN_WAIT2);
3617 }
3618 }
3619
3620 /*
3621 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3622 *
3623 * Move to TIME_WAIT
3624 */
3625
3626 if (sk->state == TCP_CLOSING)
3627 {
3628
3629 if (!sk->dead)
3630 sk->state_change(sk);
3631 if (sk->rcv_ack_seq == sk->write_seq)
3632 {
3633 flag |= 1;
3634 tcp_time_wait(sk);
3635 }
3636 }
3637
3638 /*
3639 * Final ack of a three way shake
3640 */
3641
3642 if(sk->state==TCP_SYN_RECV)
3643 {
3644 tcp_set_state(sk, TCP_ESTABLISHED);
3645 tcp_options(sk,th);
3646 sk->dummy_th.dest=th->source;
3647 sk->copied_seq = sk->acked_seq;
3648 if(!sk->dead)
3649 sk->state_change(sk);
3650 if(sk->max_window==0)
3651 {
3652 sk->max_window=32; /* Sanity check */
3653 sk->mss=min(sk->max_window,sk->mtu);
3654 }
3655 }
3656
3657 /*
3658 * I make no guarantees about the first clause in the following
3659 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3660 * what conditions "!flag" would be true. However I think the rest
3661 * of the conditions would prevent that from causing any
3662 * unnecessary retransmission.
3663 * Clearly if the first packet has expired it should be
3664 * retransmitted. The other alternative, "flag&2 && retransmits", is
3665 * harder to explain: You have to look carefully at how and when the
3666 * timer is set and with what timeout. The most recent transmission always
3667 * sets the timer. So in general if the most recent thing has timed
3668 * out, everything before it has as well. So we want to go ahead and
3669 * retransmit some more. If we didn't explicitly test for this
3670 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3671 * would not be true. If you look at the pattern of timing, you can
3672 * show that rto is increased fast enough that the next packet would
3673 * almost never be retransmitted immediately. Then you'd end up
3674 * waiting for a timeout to send each packet on the retransmission
3675 * queue. With my implementation of the Karn sampling algorithm,
3676 * the timeout would double each time. The net result is that it would
3677 * take a hideous amount of time to recover from a single dropped packet.
3678 * It's possible that there should also be a test for TIME_WRITE, but
3679 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3680 * got to be in real retransmission mode.
3681 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3682 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3683 * As long as no further losses occur, this seems reasonable.
3684 */
3685
3686 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3687 (((flag&2) && sk->retransmits) ||
3688 (sk->send_head->when + sk->rto < jiffies)))
3689 {
3690 if(sk->send_head->when + sk->rto < jiffies)
3691 tcp_retransmit(sk,0);
3692 else
3693 {
3694 tcp_do_retransmit(sk, 1);
3695 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3696 }
3697 }
3698
3699 return(1);
3700 }
3701
3702
3703 /*
3704 * Process the FIN bit. This now behaves as it is supposed to:
3705 * the FIN takes effect only when it is validly part of sequence
3706 * space, not earlier when we still have holes before it.
3707 *
3708 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3709 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3710 * TIME-WAIT)
3711 *
3712 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3713 * close and we go into CLOSING (and later onto TIME-WAIT)
3714 *
3715 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3716 *
3717 */
3718
3719 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3720 {
3721 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3722
3723 if (!sk->dead)
3724 {
3725 sk->state_change(sk);
3726 sock_wake_async(sk->socket, 1);
3727 }
3728
3729 switch(sk->state)
3730 {
3731 case TCP_SYN_RECV:
3732 case TCP_SYN_SENT:
3733 case TCP_ESTABLISHED:
3734 /*
3735 * move to CLOSE_WAIT, tcp_data() already handled
3736 * sending the ack.
3737 */
3738 tcp_set_state(sk,TCP_CLOSE_WAIT);
3739 if (th->rst)
3740 sk->shutdown = SHUTDOWN_MASK;
3741 break;
3742
3743 case TCP_CLOSE_WAIT:
3744 case TCP_CLOSING:
3745 /*
3746 * received a retransmission of the FIN, do
3747 * nothing.
3748 */
3749 break;
3750 case TCP_TIME_WAIT:
3751 /*
3752 * received a retransmission of the FIN,
3753 * restart the TIME_WAIT timer.
3754 */
3755 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3756 return(0);
3757 case TCP_FIN_WAIT1:
3758 /*
3759 * This case occurs when a simultaneous close
3760 * happens, we must ack the received FIN and
3761 * enter the CLOSING state.
3762 *
3763 * This causes a WRITE timeout, which will either
3764 * move on to TIME_WAIT when we timeout, or resend
3765 * the FIN properly (maybe we get rid of that annoying
3766 * FIN lost hang). The TIME_WRITE code is already correct
3767 * for handling this timeout.
3768 */
3769
3770 if(sk->ip_xmit_timeout != TIME_WRITE)
3771 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3772 tcp_set_state(sk,TCP_CLOSING);
3773 break;
3774 case TCP_FIN_WAIT2:
3775 /*
3776 * received a FIN -- send ACK and enter TIME_WAIT
3777 */
3778 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3779 sk->shutdown|=SHUTDOWN_MASK;
3780 tcp_set_state(sk,TCP_TIME_WAIT);
3781 break;
3782 case TCP_CLOSE:
3783 /*
3784 * already in CLOSE
3785 */
3786 break;
3787 default:
3788 tcp_set_state(sk,TCP_LAST_ACK);
3789
3790 /* Start the timers. */
3791 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3792 return(0);
3793 }
3794
3795 return(0);
3796 }
3797
3798
3799
3800 /*
3801 * This routine handles the data. If there is room in the buffer,
3802 * it will already have been moved into it. If there is no
3803 * room, then we will just have to discard the packet.
3804 */
3805
3806 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
3807 unsigned long saddr, unsigned short len)
3808 {
3809 struct sk_buff *skb1, *skb2;
3810 struct tcphdr *th;
3811 int dup_dumped=0;
3812 u32 new_seq, shut_seq;
3813
3814 th = skb->h.th;
3815 skb_pull(skb,th->doff*4);
3816 skb_trim(skb,len-(th->doff*4));
3817
3818 /*
3819 * The bytes in the receive read/assembly queue have increased. Needed for the
3820 * low memory discard algorithm
3821 */
3822
3823 sk->bytes_rcv += skb->len;
3824
3825 if (skb->len == 0 && !th->fin)
3826 {
3827 /*
3828 * Don't want to keep passing ack's back and forth.
3829 * (someone sent us dataless, boring frame)
3830 */
3831 if (!th->ack)
3832 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3833 kfree_skb(skb, FREE_READ);
3834 return(0);
3835 }
3836
3837 /*
3838 * We no longer have anyone receiving data on this connection.
3839 */
3840
3841 #ifndef TCP_DONT_RST_SHUTDOWN
3842
3843 if(sk->shutdown & RCV_SHUTDOWN)
3844 {
3845 /*
3846 * FIXME: BSD has some magic to avoid sending resets to
3847 * broken 4.2 BSD keepalives. Much to my surprise a few non
3848 * BSD stacks still have broken keepalives so we want to
3849 * cope with it.
3850 */
3851
3852 if(skb->len) /* We don't care if it's just an ack or
3853 a keepalive/window probe */
3854 {
3855 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
3856
3857 /* Do this the way 4.4BSD treats it. Not what I'd
3858 regard as the meaning of the spec but it's what BSD
3859 does and clearly they know everything 8) */
3860
3861 /*
3862 * This is valid because of two things
3863 *
3864 * a) The way tcp_data behaves at the bottom.
3865 * b) A fin takes effect when read not when received.
3866 */
3867
3868 shut_seq=sk->acked_seq+1; /* Last byte */
3869
3870 if(after(new_seq,shut_seq))
3871 {
3872 if(sk->debug)
3873 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3874 sk, new_seq, shut_seq, sk->blog);
3875 if(sk->dead)
3876 {
3877 sk->acked_seq = new_seq + th->fin;
3878 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3879 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3880 tcp_statistics.TcpEstabResets++;
3881 tcp_set_state(sk,TCP_CLOSE);
3882 sk->err = EPIPE;
3883 sk->shutdown = SHUTDOWN_MASK;
3884 kfree_skb(skb, FREE_READ);
3885 return 0;
3886 }
3887 }
3888 }
3889 }
3890
3891 #endif
3892
3893 /*
3894 * Now we have to walk the chain, and figure out where this one
3895 * goes into it. This is set up so that the last packet we received
3896 * will be the first one we look at, that way if everything comes
3897 * in order, there will be no performance loss, and if they come
3898 * out of order we will be able to fit things in nicely.
3899 *
3900 * [AC: This is wrong. We should assume in order first and then walk
3901 * forwards from the first hole based upon real traffic patterns.]
3902 *
3903 */
3904
3905 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
3906 {
3907 skb_queue_head(&sk->receive_queue,skb);
3908 skb1= NULL;
3909 }
3910 else
3911 {
3912 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3913 {
3914 if(sk->debug)
3915 {
3916 printk("skb1=%p :", skb1);
3917 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3918 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3919 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3920 sk->acked_seq);
3921 }
3922
3923 /*
3924 * Optimisation: Duplicate frame or extension of previous frame from
3925 * same sequence point (lost ack case).
3926 * The frame contains duplicate data or replaces a previous frame;
3927 * discard the previous frame (safe as sk->inuse is set) and put
3928 * the new one in its place.
3929 */
3930
3931 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3932 {
3933 skb_append(skb1,skb);
3934 skb_unlink(skb1);
3935 kfree_skb(skb1,FREE_READ);
3936 dup_dumped=1;
3937 skb1=NULL;
3938 break;
3939 }
3940
3941 /*
3942 * Found where it fits
3943 */
3944
3945 if (after(th->seq+1, skb1->h.th->seq))
3946 {
3947 skb_append(skb1,skb);
3948 break;
3949 }
3950
3951 /*
3952 * See if we've hit the start. If so insert.
3953 */
3954 if (skb1 == skb_peek(&sk->receive_queue))
3955 {
3956 skb_queue_head(&sk->receive_queue, skb);
3957 break;
3958 }
3959 }
3960 }
3961
3962 /*
3963 * Figure out what the ack value for this frame is
3964 */
3965
3966 th->ack_seq = th->seq + skb->len;
3967 if (th->syn)
3968 th->ack_seq++;
3969 if (th->fin)
3970 th->ack_seq++;
3971
3972 if (before(sk->acked_seq, sk->copied_seq))
3973 {
3974 printk("*** tcp.c:tcp_data bug acked < copied\n");
3975 sk->acked_seq = sk->copied_seq;
3976 }
3977
3978 /*
3979 * Now figure out if we can ack anything. This is very messy because we really want two
3980 * receive queues, a completed and an assembly queue. We also want only one transmit
3981 * queue.
3982 */
3983
3984 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3985 {
3986 if (before(th->seq, sk->acked_seq+1))
3987 {
3988 int newwindow;
3989
3990 if (after(th->ack_seq, sk->acked_seq))
3991 {
3992 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3993 if (newwindow < 0)
3994 newwindow = 0;
3995 sk->window = newwindow;
3996 sk->acked_seq = th->ack_seq;
3997 }
3998 skb->acked = 1;
3999
4000 /*
4001 * When we ack the fin, we do the FIN
4002 * processing.
4003 */
4004
4005 if (skb->h.th->fin)
4006 {
4007 tcp_fin(skb,sk,skb->h.th);
4008 }
4009
4010 for(skb2 = skb->next;
4011 skb2 != (struct sk_buff *)&sk->receive_queue;
4012 skb2 = skb2->next)
4013 {
4014 if (before(skb2->h.th->seq, sk->acked_seq+1))
4015 {
4016 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4017 {
4018 newwindow = sk->window -
4019 (skb2->h.th->ack_seq - sk->acked_seq);
4020 if (newwindow < 0)
4021 newwindow = 0;
4022 sk->window = newwindow;
4023 sk->acked_seq = skb2->h.th->ack_seq;
4024 }
4025 skb2->acked = 1;
4026 /*
4027 * When we ack the fin, we do
4028 * the fin handling.
4029 */
4030 if (skb2->h.th->fin)
4031 {
4032 tcp_fin(skb2,sk,skb2->h.th);
4033 }
4034
4035 /*
4036 * Force an immediate ack.
4037 */
4038
4039 sk->ack_backlog = sk->max_ack_backlog;
4040 }
4041 else
4042 {
4043 break;
4044 }
4045 }
4046
4047 /*
4048 * This also takes care of updating the window.
4049 * This if statement needs to be simplified.
4050 */
4051 if (!sk->delay_acks ||
4052 sk->ack_backlog >= sk->max_ack_backlog ||
4053 sk->bytes_rcv > sk->max_unacked || th->fin) {
4054 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4055 }
4056 else
4057 {
4058 sk->ack_backlog++;
4059 if(sk->debug)
4060 printk("Ack queued.\n");
4061 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4062 }
4063 }
4064 }
4065
4066 /*
4067 * If we've missed a packet, send an ack.
4068 * Also start a timer to send another.
4069 */
4070
4071 if (!skb->acked)
4072 {
4073
4074 /*
4075 * This is important. If we don't have much room left,
4076 * we need to throw out a few packets so we have a good
4077 * window. Note that mtu is used, not mss, because mss is really
4078 * for the send side. He could be sending us stuff as large as mtu.
4079 */
4080
4081 while (sk->prot->rspace(sk) < sk->mtu)
4082 {
4083 skb1 = skb_peek(&sk->receive_queue);
4084 if (skb1 == NULL)
4085 {
4086 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4087 break;
4088 }
4089
4090 /*
4091 * Don't throw out something that has been acked.
4092 */
4093
4094 if (skb1->acked)
4095 {
4096 break;
4097 }
4098
4099 skb_unlink(skb1);
4100 kfree_skb(skb1, FREE_READ);
4101 }
4102 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4103 sk->ack_backlog++;
4104 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4105 }
4106 else
4107 {
4108 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4109 }
4110
4111 /*
4112 * Now tell the user we may have some data.
4113 */
4114
4115 if (!sk->dead)
4116 {
4117 if(sk->debug)
4118 printk("Data wakeup.\n");
4119 sk->data_ready(sk,0);
4120 }
4121 return(0);
4122 }
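
/*
 * The queue walk in tcp_data() above leans entirely on wrap-safe
 * sequence comparison. A small self-contained sketch of that
 * arithmetic (seq_before()/seq_after() mirror the usual signed
 * difference trick behind before()/after(); they are assumptions
 * of the sketch, not copies of the kernel helpers):
 */

#if 0
#include <assert.h>

typedef unsigned int u32;

static int seq_before(u32 a, u32 b) { return (int)(a - b) < 0; }
static int seq_after(u32 a, u32 b)  { return (int)(b - a) < 0; }

int main(void)
{
	u32 acked_seq = 1000;

	/* The 32 bit sequence space wraps: 0xfffffff0 is "before" 0x10. */
	assert(seq_before(0xfffffff0u, 0x10u));

	/* The in-order test used above: seq <= acked_seq means the
	   frame extends (or duplicates) data we have already acked. */
	assert(seq_before(900, acked_seq + 1));		/* duplicate */
	assert(!seq_before(1500, acked_seq + 1));	/* a hole before it */
	return 0;
}
#endif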
4123
4124
4125 /*
4126 * This routine is only called when we have urgent data
4127 * signalled. It's the 'slow' part of tcp_urg. It could be
4128 * moved inline now as tcp_urg is only called from one
4129 * place. We handle URGent data wrong. We have to - as
4130 * BSD still doesn't use the correction from RFC961.
4131 */
4132
4133 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4134 {
4135 u32 ptr = ntohs(th->urg_ptr);
4136
4137 if (ptr)
4138 ptr--;
4139 ptr += th->seq;
4140
4141 /* ignore urgent data that we've already seen and read */
4142 if (after(sk->copied_seq, ptr))
4143 return;
4144
4145 /* do we already have a newer (or duplicate) urgent pointer? */
4146 if (sk->urg_data && !after(ptr, sk->urg_seq))
4147 return;
4148
4149 /* tell the world about our new urgent pointer */
4150 if (sk->proc != 0) {
4151 if (sk->proc > 0) {
4152 kill_proc(sk->proc, SIGURG, 1);
4153 } else {
4154 kill_pg(-sk->proc, SIGURG, 1);
4155 }
4156 }
4157 sk->urg_data = URG_NOTYET;
4158 sk->urg_seq = ptr;
4159 }
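
/*
 * A worked example of the urgent pointer arithmetic above, with
 * made-up numbers (a sketch, not kernel code). With th->seq = 5000
 * and th->urg_ptr carrying 4, the urgent byte is recorded at
 * sequence 5003:
 */

#if 0
#include <assert.h>
#include <arpa/inet.h>		/* user-space htons()/ntohs() */

int main(void)
{
	unsigned int seq = 5000;
	unsigned short urg_net = htons(4);	/* as carried in the header */
	unsigned int ptr = ntohs(urg_net);

	if (ptr)
		ptr--;		/* step back to the urgent byte itself */
	ptr += seq;

	assert(ptr == 5003);	/* this becomes sk->urg_seq */
	return 0;
}
#endif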
4160
4161 /*
4162 * This is the 'fast' part of urgent handling.
4163 */
4164
4165 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4166 unsigned long saddr, unsigned long len)
4167 {
4168 u32 ptr;
4169
4170 /*
4171 * Check if we get a new urgent pointer - normally not
4172 */
4173
4174 if (th->urg)
4175 tcp_check_urg(sk,th);
4176
4177 /*
4178 * Do we wait for any urgent data? - normally not
4179 */
4180
4181 if (sk->urg_data != URG_NOTYET)
4182 return 0;
4183
4184 /*
4185 * Is the urgent pointer pointing into this packet?
4186 */
4187
4188 ptr = sk->urg_seq - th->seq + th->doff*4;
4189 if (ptr >= len)
4190 return 0;
4191
4192 /*
4193 * Ok, got the correct packet, update info
4194 */
4195
4196 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4197 if (!sk->dead)
4198 sk->data_ready(sk,0);
4199 return 0;
4200 }
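
/*
 * tcp_urg() above turns the remembered sequence number back into a
 * byte offset within the raw segment. A sketch with sample values:
 * the urgent byte at sequence 5003 in a segment starting at 5000
 * with a 20 byte header (doff = 5) sits 23 bytes in, so the segment
 * must be at least 24 bytes long to contain it.
 */

#if 0
#include <assert.h>

int main(void)
{
	unsigned int urg_seq = 5003, seq = 5000, doff = 5, len = 40;
	unsigned int ptr = urg_seq - seq + doff * 4;

	assert(ptr == 23);
	assert(ptr < len);	/* the urgent byte is inside this packet */
	return 0;
}
#endif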
4201
4202 /*
4203 * This will accept the next outstanding connection.
4204 */
4205
4206 static struct sock *tcp_accept(struct sock *sk, int flags)
4207 {
4208 struct sock *newsk;
4209 struct sk_buff *skb;
4210
4211 /*
4212 * We need to make sure that this socket is listening,
4213 * and that it has something pending.
4214 */
4215
4216 if (sk->state != TCP_LISTEN)
4217 {
4218 sk->err = EINVAL;
4219 return(NULL);
4220 }
4221
4222 /* Avoid the race. */
4223 cli();
4224 sk->inuse = 1;
4225
4226 while((skb = tcp_dequeue_established(sk)) == NULL)
4227 {
4228 if (flags & O_NONBLOCK)
4229 {
4230 sti();
4231 release_sock(sk);
4232 sk->err = EAGAIN;
4233 return(NULL);
4234 }
4235
4236 release_sock(sk);
4237 interruptible_sleep_on(sk->sleep);
4238 if (current->signal & ~current->blocked)
4239 {
4240 sti();
4241 sk->err = ERESTARTSYS;
4242 return(NULL);
4243 }
4244 sk->inuse = 1;
4245 }
4246 sti();
4247
4248 /*
4249 * Now all we need to do is return skb->sk.
4250 */
4251
4252 newsk = skb->sk;
4253
4254 kfree_skb(skb, FREE_READ);
4255 sk->ack_backlog--;
4256 release_sock(sk);
4257 return(newsk);
4258 }
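
/*
 * From user space this path is reached through accept(). A minimal
 * sketch using the standard socket API (the port number is arbitrary):
 * with O_NONBLOCK set, an empty queue surfaces as EAGAIN instead of
 * sleeping in interruptible_sleep_on().
 */

#if 0
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, 0), conn;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(4242);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	bind(fd, (struct sockaddr *) &sin, sizeof(sin));
	listen(fd, 5);
	fcntl(fd, F_SETFL, O_NONBLOCK);

	conn = accept(fd, NULL, NULL);
	if (conn < 0 && errno == EAGAIN)
		printf("no pending connection yet\n");
	close(fd);
	return 0;
}
#endif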
4259
4260
4261 /*
4262 * This will initiate an outgoing connection.
4263 */
4264
4265 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4266 {
4267 struct sk_buff *buff;
4268 struct device *dev=NULL;
4269 unsigned char *ptr;
4270 int tmp;
4271 int atype;
4272 struct tcphdr *t1;
4273 struct rtable *rt;
4274
4275 if (sk->state != TCP_CLOSE)
4276 {
4277 return(-EISCONN);
4278 }
4279
4280 if (addr_len < 8)
4281 return(-EINVAL);
4282
4283 if (usin->sin_family && usin->sin_family != AF_INET)
4284 return(-EAFNOSUPPORT);
4285
4286 /*
4287 * connect() to INADDR_ANY means loopback (BSD'ism).
4288 */
4289
4290 if(usin->sin_addr.s_addr==INADDR_ANY)
4291 usin->sin_addr.s_addr=ip_my_addr();
4292
4293 /*
4294 * Don't want a TCP connection going to a broadcast address
4295 */
4296
4297 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4298 return -ENETUNREACH;
4299
4300 sk->inuse = 1;
4301 sk->daddr = usin->sin_addr.s_addr;
4302 sk->write_seq = tcp_init_seq();
4303 sk->window_seq = sk->write_seq;
4304 sk->rcv_ack_seq = sk->write_seq -1;
4305 sk->err = 0;
4306 sk->dummy_th.dest = usin->sin_port;
4307 release_sock(sk);
4308
4309 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4310 if (buff == NULL)
4311 {
4312 return(-ENOMEM);
4313 }
4314 sk->inuse = 1;
4315 buff->sk = sk;
4316 buff->free = 0;
4317 buff->localroute = sk->localroute;
4318
4319
4320 /*
4321 * Put in the IP header and routing stuff.
4322 */
4323
4324 rt=ip_rt_route(sk->daddr, NULL, NULL);
4325
4326
4327 /*
4328 * We need to build the routing stuff from the things saved in skb.
4329 */
4330
4331 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4332 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4333 if (tmp < 0)
4334 {
4335 sk->prot->wfree(sk, buff);
4336 release_sock(sk);
4337 return(-ENETUNREACH);
4338 }
4339
4340 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4341
4342 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4343 t1->seq = ntohl(sk->write_seq++);
4344 sk->sent_seq = sk->write_seq;
4345 buff->h.seq = sk->write_seq;
4346 t1->ack = 0;
4347 t1->window = 2;
4348 t1->res1=0;
4349 t1->res2=0;
4350 t1->rst = 0;
4351 t1->urg = 0;
4352 t1->psh = 0;
4353 t1->syn = 1;
4354 t1->urg_ptr = 0;
4355 t1->doff = 6;
4356 /* use 512 or whatever user asked for */
4357
4358 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4359 sk->window_clamp=rt->rt_window;
4360 else
4361 sk->window_clamp=0;
4362
4363 if (sk->user_mss)
4364 sk->mtu = sk->user_mss;
4365 else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4366 sk->mtu = rt->rt_mss;
4367 else
4368 {
4369 #ifdef CONFIG_INET_SNARL
4370 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4371 #else
4372 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4373 #endif
4374 sk->mtu = 576 - HEADER_SIZE;
4375 else
4376 sk->mtu = MAX_WINDOW;
4377 }
4378 /*
4379 * but not bigger than device MTU
4380 */
4381
4382 if(sk->mtu <32)
4383 sk->mtu = 32; /* Sanity limit */
4384
4385 sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4386
4387 /*
4388 * Put in the TCP options to say MTU.
4389 */
4390
4391 ptr = skb_put(buff,4);
4392 ptr[0] = 2;
4393 ptr[1] = 4;
4394 ptr[2] = (sk->mtu) >> 8;
4395 ptr[3] = (sk->mtu) & 0xff;
4396 tcp_send_check(t1, sk->saddr, sk->daddr,
4397 sizeof(struct tcphdr) + 4, sk);
4398
4399 /*
4400 * This must go first otherwise a really quick response will get reset.
4401 */
4402
4403 tcp_cache_zap();
4404 tcp_set_state(sk,TCP_SYN_SENT);
4405 if(rt&&rt->rt_flags&RTF_IRTT)
4406 sk->rto = rt->rt_irtt;
4407 else
4408 sk->rto = TCP_TIMEOUT_INIT;
4409 sk->retransmit_timer.function=&retransmit_timer;
4410 sk->retransmit_timer.data = (unsigned long)sk;
4411 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4412 sk->retransmits = 0; /* Now works the right way instead of a hacked initial setting */
4413
4414 sk->prot->queue_xmit(sk, dev, buff, 0);
4415 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4416 tcp_statistics.TcpActiveOpens++;
4417 tcp_statistics.TcpOutSegs++;
4418
4419 release_sock(sk);
4420 return(0);
4421 }
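
/*
 * The four bytes appended through skb_put(buff,4) above are the TCP
 * maximum segment size option: kind 2, length 4, then the MSS in
 * network byte order. A sketch of the encoding with 1460 (0x05B4)
 * as a sample value:
 */

#if 0
#include <assert.h>

int main(void)
{
	unsigned int mss = 1460;
	unsigned char opt[4];

	opt[0] = 2;			/* kind: maximum segment size */
	opt[1] = 4;			/* total option length */
	opt[2] = (mss >> 8) & 0xff;	/* high byte first */
	opt[3] = mss & 0xff;

	assert(opt[2] == 0x05 && opt[3] == 0xB4);
	assert((unsigned int)((opt[2] << 8) | opt[3]) == mss);
	return 0;
}
#endif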
4422
4423
4424 /* This function checks to see if the TCP header is actually acceptable. */
4425 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4426 struct options *opt, unsigned long saddr, struct device *dev)
4427 {
4428 u32 next_seq;
4429
4430 next_seq = len - 4*th->doff;
4431 if (th->fin)
4432 next_seq++;
4433 /* if we have a zero window, we can't have any data in the packet.. */
4434 if (next_seq && !sk->window)
4435 goto ignore_it;
4436 next_seq += th->seq;
4437
4438 /*
4439 * This isn't quite right. sk->acked_seq could be more recent
4440 * than sk->window. This is however close enough. We will accept
4441 * slightly more packets than we should, but it should not cause
4442 * problems unless someone is trying to forge packets.
4443 */
4444
4445 /* have we already seen all of this packet? */
4446 if (!after(next_seq+1, sk->acked_seq))
4447 goto ignore_it;
4448 /* or does it start beyond the window? */
4449 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4450 goto ignore_it;
4451
4452 /* ok, at least part of this packet would seem interesting.. */
4453 return 1;
4454
4455 ignore_it:
4456 if (th->rst)
4457 return 0;
4458
4459 /*
4460 * Send a reset if we get something not ours and we are
4461 * unsynchronized. Note: We don't do anything to our end. We
4462 * are just killing the bogus remote connection; then we will
4463 * connect again and it will work (with luck).
4464 */
4465
4466 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4467 {
4468 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4469 return 1;
4470 }
4471
4472 /* Try to resync things. */
4473 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4474 return 0;
4475 }
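
/*
 * The two tests above accept any segment that overlaps the interval
 * (acked_seq, acked_seq + window]. A self-contained sketch of the
 * same acceptance test (seq_before()/seq_after() mirror the wrap-safe
 * comparisons; the sample numbers are made up):
 */

#if 0
#include <assert.h>

typedef unsigned int u32;

static int seq_before(u32 a, u32 b) { return (int)(a - b) < 0; }
static int seq_after(u32 a, u32 b)  { return (int)(b - a) < 0; }

/* next_seq is the sequence number just past the segment's last byte */
static int acceptable(u32 seq, u32 next_seq, u32 acked_seq, u32 window)
{
	if (!seq_after(next_seq + 1, acked_seq))	/* all old data */
		return 0;
	if (!seq_before(seq, acked_seq + window + 1))	/* past the window */
		return 0;
	return 1;
}

int main(void)
{
	u32 acked_seq = 1000, window = 4096;

	assert(!acceptable(800, 900, acked_seq, window));   /* stale */
	assert(acceptable(900, 1100, acked_seq, window));   /* overlaps edge */
	assert(!acceptable(5097, 5197, acked_seq, window)); /* starts beyond */
	return 0;
}
#endif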
4476
4477 /*
4478 * When we get a reset we do this.
4479 */
4480
4481 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4482 {
4483 sk->zapped = 1;
4484 sk->err = ECONNRESET;
4485 if (sk->state == TCP_SYN_SENT)
4486 sk->err = ECONNREFUSED;
4487 if (sk->state == TCP_CLOSE_WAIT)
4488 sk->err = EPIPE;
4489 #ifdef TCP_DO_RFC1337
4490 /*
4491 * Time wait assassination protection [RFC1337]
4492 */
4493 if(sk->state!=TCP_TIME_WAIT)
4494 {
4495 tcp_set_state(sk,TCP_CLOSE);
4496 sk->shutdown = SHUTDOWN_MASK;
4497 }
4498 #else
4499 tcp_set_state(sk,TCP_CLOSE);
4500 sk->shutdown = SHUTDOWN_MASK;
4501 #endif
4502 if (!sk->dead)
4503 sk->state_change(sk);
4504 kfree_skb(skb, FREE_READ);
4505 release_sock(sk);
4506 return(0);
4507 }
4508
4509 /*
4510 * A TCP packet has arrived.
4511 * skb->h.raw is the TCP header.
4512 */
4513
4514 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4515 unsigned long daddr, unsigned short len,
4516 unsigned long saddr, int redo, struct inet_protocol * protocol)
4517 {
4518 struct tcphdr *th;
4519 struct sock *sk;
4520 int syn_ok=0;
4521
4522 tcp_statistics.TcpInSegs++;
4523 if(skb->pkt_type!=PACKET_HOST)
4524 {
4525 kfree_skb(skb,FREE_READ);
4526 return(0);
4527 }
4528
4529 th = skb->h.th;
4530
4531 /*
4532 * Find the socket, using the last hit cache if applicable.
4533 */
4534
4535 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4536 sk=(struct sock *)th_cache_sk;
4537 else
4538 {
4539 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4540 th_cache_saddr=saddr;
4541 th_cache_daddr=daddr;
4542 th_cache_dport=th->dest;
4543 th_cache_sport=th->source;
4544 th_cache_sk=sk;
4545 }
4546
4547 /*
4548 * If this socket has got a reset it's to all intents and purposes
4549 * really dead. Count closed sockets as dead.
4550 *
4551 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4552 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4553 * exist and so should cause resets as if the port were unreachable.
4554 */
4555
4556 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4557 sk=NULL;
4558
4559 if (!redo)
4560 {
4561 /*
4562 * Pull up the IP header.
4563 */
4564 skb_pull(skb, skb->h.raw-skb->data);
4565 /*
4566 * Try to use the device checksum if provided.
4567 */
4568 if (
4569 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4570 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4571 )
4572 {
4573 skb->sk = NULL;
4574 kfree_skb(skb,FREE_READ);
4575 /*
4576 * We don't release the socket because it was
4577 * never marked in use.
4578 */
4579 return(0);
4580 }
4581 th->seq = ntohl(th->seq);
4582
4583 /* See if we know about the socket. */
4584 if (sk == NULL)
4585 {
4586 /*
4587 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4588 */
4589 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4590 skb->sk = NULL;
4591 /*
4592 * Discard frame
4593 */
4594 kfree_skb(skb, FREE_READ);
4595 return(0);
4596 }
4597
4598 /* skb->len = len;*/
4599 skb->acked = 0;
4600 skb->used = 0;
4601 skb->free = 0;
4602 skb->saddr = daddr;
4603 skb->daddr = saddr;
4604
4605 /* We may need to add it to the backlog here. */
4606 cli();
4607 if (sk->inuse)
4608 {
4609 skb_queue_tail(&sk->back_log, skb);
4610 sti();
4611 return(0);
4612 }
4613 sk->inuse = 1;
4614 sti();
4615 }
4616 else
4617 {
4618 if (sk==NULL)
4619 {
4620 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4621 skb->sk = NULL;
4622 kfree_skb(skb, FREE_READ);
4623 return(0);
4624 }
4625 }
4626
4627
4628 if (!sk->prot)
4629 {
4630 printk("IMPOSSIBLE 3\n");
4631 return(0);
4632 }
4633
4634
4635 /*
4636 * Charge the memory to the socket.
4637 */
4638
4639 if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
4640 {
4641 kfree_skb(skb, FREE_READ);
4642 release_sock(sk);
4643 return(0);
4644 }
4645
4646 skb->sk=sk;
4647 sk->rmem_alloc += skb->truesize;
4648
4649 /*
4650 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4651 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4652 * compatibility. We also set up variables more thoroughly [Karn notes in the
4653 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4654 */
4655
4656 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4657 {
4658
4659 /*
4660 * Now deal with unusual cases.
4661 */
4662
4663 if(sk->state==TCP_LISTEN)
4664 {
4665 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4666 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4667
4668 /*
4669 * We don't care for RST, and non-SYN segments are absorbed (old segments).
4670 * Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
4671 * netmask on a running connection it can go broadcast. Even Suns have
4672 * this problem so I'm ignoring it.
4673 */
4674
4675 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4676 {
4677 kfree_skb(skb, FREE_READ);
4678 release_sock(sk);
4679 return 0;
4680 }
4681
4682 /*
4683 * Guess we need to make a new socket up
4684 */
4685
4686 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4687
4688 /*
4689 * Now we have several options: In theory there is nothing else
4690 * in the frame. KA9Q has an option to send data with the syn,
4691 * BSD accepts data with the syn up to the [to be] advertised window
4692 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4693 * it; that fits the spec precisely and avoids incompatibilities. It
4694 * would be nice in future to drop through and process the data.
4695 */
4696
4697 release_sock(sk);
4698 return 0;
4699 }
4700
4701 /* retransmitted SYN? */
4702 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4703 {
4704 kfree_skb(skb, FREE_READ);
4705 release_sock(sk);
4706 return 0;
4707 }
4708
4709 /*
4710 * SYN sent means we have to look for a suitable ack and either reset
4711 * for bad matches or go to connected
4712 */
4713
4714 if(sk->state==TCP_SYN_SENT)
4715 {
4716 /* Crossed SYN or previous junk segment */
4717 if(th->ack)
4718 {
4719 /* We got an ack, but it's not a good ack */
4720 if(!tcp_ack(sk,th,saddr,len))
4721 {
4722 /* Reset the ack - it's an ack from a
4723 different connection [ th->rst is checked in tcp_reset()] */
4724 tcp_statistics.TcpAttemptFails++;
4725 tcp_reset(daddr, saddr, th,
4726 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4727 kfree_skb(skb, FREE_READ);
4728 release_sock(sk);
4729 return(0);
4730 }
4731 if(th->rst)
4732 return tcp_std_reset(sk,skb);
4733 if(!th->syn)
4734 {
4735 /* A valid ack from a different connection
4736 start. Shouldn't happen but cover it */
4737 kfree_skb(skb, FREE_READ);
4738 release_sock(sk);
4739 return 0;
4740 }
4741 /*
4742 * Ok.. it's good. Set up sequence numbers and
4743 * move to established.
4744 */
4745 syn_ok=1; /* Don't reset this connection for the syn */
4746 sk->acked_seq=th->seq+1;
4747 sk->fin_seq=th->seq;
4748 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4749 tcp_set_state(sk, TCP_ESTABLISHED);
4750 tcp_options(sk,th);
4751 sk->dummy_th.dest=th->source;
4752 sk->copied_seq = sk->acked_seq;
4753 if(!sk->dead)
4754 {
4755 sk->state_change(sk);
4756 sock_wake_async(sk->socket, 0);
4757 }
4758 if(sk->max_window==0)
4759 {
4760 sk->max_window = 32;
4761 sk->mss = min(sk->max_window, sk->mtu);
4762 }
4763 }
4764 else
4765 {
4766 /* See if SYN's cross. Drop if boring */
4767 if(th->syn && !th->rst)
4768 {
4769 /* Crossed SYN's are fine - but talking to
4770 yourself is right out... */
4771 if(sk->saddr==saddr && sk->daddr==daddr &&
4772 sk->dummy_th.source==th->source &&
4773 sk->dummy_th.dest==th->dest)
4774 {
4775 tcp_statistics.TcpAttemptFails++;
4776 return tcp_std_reset(sk,skb);
4777 }
4778 tcp_set_state(sk,TCP_SYN_RECV);
4779
4780 /*
4781 * FIXME:
4782 * Must send SYN|ACK here
4783 */
4784 }
4785 /* Discard junk segment */
4786 kfree_skb(skb, FREE_READ);
4787 release_sock(sk);
4788 return 0;
4789 }
4790 /*
4791 * SYN_RECV with data maybe.. drop through
4792 */
4793 goto rfc_step6;
4794 }
4795
4796 /*
4797 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4798 * a more complex suggestion for fixing these reuse issues in RFC1644
4799 * but it is not yet ready for general use. Also see RFC1379.
4800 */
4801
4802 #define BSD_TIME_WAIT
4803 #ifdef BSD_TIME_WAIT
4804 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4805 after(th->seq, sk->acked_seq) && !th->rst)
4806 {
4807 u32 seq = sk->write_seq;
4808 if(sk->debug)
4809 printk("Doing a BSD time wait\n");
4810 tcp_statistics.TcpEstabResets++;
4811 sk->rmem_alloc -= skb->truesize;
4812 skb->sk = NULL;
4813 sk->err=ECONNRESET;
4814 tcp_set_state(sk, TCP_CLOSE);
4815 sk->shutdown = SHUTDOWN_MASK;
4816 release_sock(sk);
4817 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4818 if (sk && sk->state==TCP_LISTEN)
4819 {
4820 sk->inuse=1;
4821 skb->sk = sk;
4822 sk->rmem_alloc += skb->truesize;
4823 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4824 release_sock(sk);
4825 return 0;
4826 }
4827 kfree_skb(skb, FREE_READ);
4828 return 0;
4829 }
4830 #endif
4831 }
4832
4833 /*
4834 * We are now in normal data flow (see the step list in the RFC).
4835 * Note most of these are inline now. I'll inline the lot when
4836 * I have time to test it hard and look at what gcc outputs.
4837 */
4838
4839 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4840 {
4841 kfree_skb(skb, FREE_READ);
4842 release_sock(sk);
4843 return 0;
4844 }
4845
4846 if(th->rst)
4847 return tcp_std_reset(sk,skb);
4848
4849 /*
4850 * !syn_ok is effectively the state test in RFC793.
4851 */
4852
4853 if(th->syn && !syn_ok)
4854 {
4855 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4856 return tcp_std_reset(sk,skb);
4857 }
4858
4859 /*
4860 * Process the ACK
4861 */
4862
4863
4864 if(th->ack && !tcp_ack(sk,th,saddr,len))
4865 {
4866 /*
4867 * Our three way handshake failed.
4868 */
4869
4870 if(sk->state==TCP_SYN_RECV)
4871 {
4872 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4873 }
4874 kfree_skb(skb, FREE_READ);
4875 release_sock(sk);
4876 return 0;
4877 }
4878
4879 rfc_step6: /* I'll clean this up later */
4880
4881 /*
4882 * Process urgent data
4883 */
4884
4885 if(tcp_urg(sk, th, saddr, len))
4886 {
4887 kfree_skb(skb, FREE_READ);
4888 release_sock(sk);
4889 return 0;
4890 }
4891
4892
4893 /*
4894 * Process the encapsulated data
4895 */
4896
4897 if(tcp_data(skb,sk, saddr, len))
4898 {
4899 kfree_skb(skb, FREE_READ);
4900 release_sock(sk);
4901 return 0;
4902 }
4903
4904 /*
4905 * And done
4906 */
4907
4908 release_sock(sk);
4909 return 0;
4910 }
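
/*
 * The "last hit cache" at the top of tcp_rcv() is a one-entry
 * demultiplexing cache keyed on the four-tuple, so back-to-back
 * segments on the same connection skip the full get_sock() lookup.
 * A sketch of the idea (full_lookup() is a stand-in assumed by the
 * sketch):
 */

#if 0
typedef unsigned int u32;
typedef unsigned short u16;

struct sock;
extern struct sock *full_lookup(u32 saddr, u32 daddr, u16 sport, u16 dport);

static u32 c_saddr, c_daddr;
static u16 c_sport, c_dport;
static struct sock *c_sk;

static struct sock *demux(u32 saddr, u32 daddr, u16 sport, u16 dport)
{
	if (saddr == c_saddr && daddr == c_daddr &&
	    sport == c_sport && dport == c_dport)
		return c_sk;			/* cache hit */

	c_sk = full_lookup(saddr, daddr, sport, dport);
	c_saddr = saddr;			/* remember for next time */
	c_daddr = daddr;
	c_sport = sport;
	c_dport = dport;
	return c_sk;
}
#endif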
4911
4912 /*
4913 * This routine sends a packet with an out of date sequence
4914 * number. It assumes the other end will try to ack it.
4915 */
4916
4917 static void tcp_write_wakeup(struct sock *sk)
4918 {
4919 struct sk_buff *buff,*skb;
4920 struct tcphdr *t1;
4921 struct device *dev=NULL;
4922 int tmp;
4923
4924 if (sk->zapped)
4925 return; /* After a valid reset we can send no more */
4926
4927 /*
4928 * Write data can still be transmitted/retransmitted in the
4929 * following states. If any other state is encountered, return.
4930 * [listen/close will never occur here anyway]
4931 */
4932
4933 if (sk->state != TCP_ESTABLISHED &&
4934 sk->state != TCP_CLOSE_WAIT &&
4935 sk->state != TCP_FIN_WAIT1 &&
4936 sk->state != TCP_LAST_ACK &&
4937 sk->state != TCP_CLOSING
4938 )
4939 {
4940 return;
4941 }
4942 if ( before(sk->sent_seq, sk->window_seq) &&
4943 (skb=skb_peek(&sk->write_queue)))
4944 {
4945 /*
4946 * We are probing the opening of a window
4947 * but the window size is != 0, so this
4948 * must have been a result of sender-side SWS avoidance.
4949 */
4950
4951 struct iphdr *iph;
4952 struct tcphdr *th;
4953 struct tcphdr *nth;
4954 unsigned long win_size, ow_size;
4955 void * tcp_data_start;
4956
4957 /*
4958 * How many bytes can we send ?
4959 */
4960
4961 win_size = sk->window_seq - sk->sent_seq;
4962
4963 /*
4964 * Recover the buffer pointers
4965 */
4966
4967 iph = (struct iphdr *)skb->ip_hdr;
4968 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
4969
4970 /*
4971 * Grab the data for a temporary frame
4972 */
4973
4974 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4975 (iph->ihl << 2) +
4976 sk->prot->max_header + 15,
4977 1, GFP_ATOMIC);
4978 if ( buff == NULL )
4979 return;
4980
4981 /*
4982 * If we strip the packet on the write queue we must
4983 * be ready to retransmit this one
4984 */
4985
4986 buff->free = /*0*/1;
4987
4988 buff->sk = sk;
4989 buff->localroute = sk->localroute;
4990
4991 /*
4992 * Put headers on the new packet
4993 */
4994
4995 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4996 IPPROTO_TCP, sk->opt, buff->truesize,
4997 sk->ip_tos,sk->ip_ttl);
4998 if (tmp < 0)
4999 {
5000 sk->prot->wfree(sk, buff);
5001 return;
5002 }
5003
5004 /*
5005 * Move the TCP header over
5006 */
5007
5008 buff->dev = dev;
5009
5010 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5011
5012 memcpy(nth, th, th->doff * 4);
5013
5014 /*
5015 * Correct the new header
5016 */
5017
5018 nth->ack = 1;
5019 nth->ack_seq = ntohl(sk->acked_seq);
5020 nth->window = ntohs(tcp_select_window(sk));
5021 nth->check = 0;
5022
5023 /*
5024 * Find the first data byte.
5025 */
5026
5027 tcp_data_start = skb->data + skb->dev->hard_header_len +
5028 (iph->ihl << 2) + th->doff * 4;
5029
5030 /*
5031 * Add it to our new buffer
5032 */
5033 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5034
5035 /*
5036 * Remember our right edge sequence number.
5037 */
5038
5039 buff->h.seq = sk->sent_seq + win_size;
5040 sk->sent_seq = buff->h.seq; /* Hack */
5041 #if 0
5042
5043 /*
5044 * now: shrink the queue head segment
5045 */
5046
5047 th->check = 0;
5048 ow_size = skb->len - win_size -
5049 ((unsigned long) (tcp_data_start - (void *) skb->data));
5050
5051 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5052 skb_trim(skb,skb->len-win_size);
5053 sk->sent_seq += win_size;
5054 th->seq = htonl(sk->sent_seq);
5055 if (th->urg)
5056 {
5057 unsigned short urg_ptr;
5058
5059 urg_ptr = ntohs(th->urg_ptr);
5060 if (urg_ptr <= win_size)
5061 th->urg = 0;
5062 else
5063 {
5064 urg_ptr -= win_size;
5065 th->urg_ptr = htons(urg_ptr);
5066 nth->urg_ptr = htons(win_size);
5067 }
5068 }
5069 #else
5070 if(th->urg && ntohs(th->urg_ptr) < win_size)
5071 nth->urg = 0;
5072 #endif
5073
5074 /*
5075 * Checksum the split buffer
5076 */
5077
5078 tcp_send_check(nth, sk->saddr, sk->daddr,
5079 nth->doff * 4 + win_size , sk);
5080 }
5081 else
5082 {
5083 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5084 if (buff == NULL)
5085 return;
5086
5087 buff->free = 1;
5088 buff->sk = sk;
5089 buff->localroute = sk->localroute;
5090
5091 /*
5092 * Put in the IP header and routing stuff.
5093 */
5094
5095 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5096 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5097 if (tmp < 0)
5098 {
5099 sk->prot->wfree(sk, buff);
5100 return;
5101 }
5102
5103 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5104 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5105
5106 /*
5107 * Use a previous sequence.
5108 * This should cause the other end to send an ack.
5109 */
5110
5111 t1->seq = htonl(sk->sent_seq-1);
5112 t1->ack = 1;
5113 t1->res1= 0;
5114 t1->res2= 0;
5115 t1->rst = 0;
5116 t1->urg = 0;
5117 t1->psh = 0;
5118 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5119 t1->syn = 0;
5120 t1->ack_seq = ntohl(sk->acked_seq);
5121 t1->window = ntohs(tcp_select_window(sk));
5122 t1->doff = sizeof(*t1)/4;
5123 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5124
5125 }
5126
5127 /*
5128 * Send it.
5129 */
5130
5131 sk->prot->queue_xmit(sk, dev, buff, 1);
5132 tcp_statistics.TcpOutSegs++;
5133 }
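
/*
 * The else branch above is the classic zero-window probe: a dataless
 * segment carrying an already-acked sequence number (sent_seq - 1),
 * which the peer must answer with a plain ACK, re-advertising its
 * window. A sketch of the fields that matter (the struct is a
 * simplified stand-in for struct tcphdr):
 */

#if 0
#include <assert.h>

struct probe {
	unsigned int seq, ack_seq;
	int ack, syn, fin, rst, urg, psh;
};

int main(void)
{
	unsigned int sent_seq = 20000, acked_seq = 31000;
	struct probe p = {0};

	p.seq = sent_seq - 1;	/* old sequence: claims no new data */
	p.ack_seq = acked_seq;
	p.ack = 1;		/* every other flag stays clear */

	assert(p.seq == 19999 && !p.syn && !p.fin);
	return 0;
}
#endif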
5134
5135 /*
5136 * A window probe timeout has occurred.
5137 */
5138
5139 void tcp_send_probe0(struct sock *sk)
5140 {
5141 if (sk->zapped)
5142 return; /* After a valid reset we can send no more */
5143
5144 tcp_write_wakeup(sk);
5145
5146 sk->backoff++;
5147 sk->rto = min(sk->rto << 1, 120*HZ);
5148 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5149 sk->retransmits++;
5150 sk->prot->retransmits ++;
5151 }
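
/*
 * The probe timer backs off exponentially: every unanswered probe
 * doubles sk->rto, saturating at 120*HZ. A sketch of the clamping
 * (HZ = 100 is assumed for the sample numbers):
 */

#if 0
#include <assert.h>

#define HZ 100

static int minimum(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int rto = 3 * HZ;	/* sample starting value */
	int i;

	for (i = 0; i < 10; i++)
		rto = minimum(rto << 1, 120 * HZ);

	assert(rto == 120 * HZ);	/* clamped after a few doublings */
	return 0;
}
#endif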
5152
5153 /*
5154 * Socket option code for TCP.
5155 */
5156
5157 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5158 {
5159 int val,err;
5160
5161 if(level!=SOL_TCP)
5162 return ip_setsockopt(sk,level,optname,optval,optlen);
5163
5164 if (optval == NULL)
5165 return(-EINVAL);
5166
5167 err=verify_area(VERIFY_READ, optval, sizeof(int));
5168 if(err)
5169 return err;
5170
5171 val = get_user((int *)optval);
5172
5173 switch(optname)
5174 {
5175 case TCP_MAXSEG:
5176 /*
5177 * Values greater than the interface MTU won't take effect. However, at
5178 * the point when this call is done we typically don't yet know
5179 * which interface is going to be used.
5180 */
5181 if(val<1||val>MAX_WINDOW)
5182 return -EINVAL;
5183 sk->user_mss=val;
5184 return 0;
5185 case TCP_NODELAY:
5186 sk->nonagle=(val==0)?0:1;
5187 return 0;
5188 default:
5189 return(-ENOPROTOOPT);
5190 }
5191 }
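
/*
 * A user-space sketch of exercising the two SOL_TCP options handled
 * above and read back by tcp_getsockopt() below. IPPROTO_TCP (the
 * same value as SOL_TCP) is used for portability; the MSS value is
 * arbitrary and is only recorded in sk->user_mss until the next
 * connect.
 */

#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1, mss = 512, val;
	socklen_t len = sizeof(val);

	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
	setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));

	getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len);
	printf("requested mss: %d\n", val);	/* echoes user_mss */
	close(fd);
	return 0;
}
#endif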
5192
5193 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5194 {
5195 int val,err;
5196
5197 if(level!=SOL_TCP)
5198 return ip_getsockopt(sk,level,optname,optval,optlen);
5199
5200 switch(optname)
5201 {
5202 case TCP_MAXSEG:
5203 val=sk->user_mss;
5204 break;
5205 case TCP_NODELAY:
5206 val=sk->nonagle;
5207 break;
5208 default:
5209 return(-ENOPROTOOPT);
5210 }
5211 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5212 if(err)
5213 return err;
5214 put_user(sizeof(int),(int *) optlen);
5215
5216 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5217 if(err)
5218 return err;
5219 put_user(val,(int *)optval);
5220
5221 return(0);
5222 }
5223
5224
5225 struct proto tcp_prot = {
5226 sock_wmalloc,
5227 sock_rmalloc,
5228 sock_wfree,
5229 sock_rfree,
5230 sock_rspace,
5231 sock_wspace,
5232 tcp_close,
5233 tcp_read,
5234 tcp_write,
5235 tcp_sendto,
5236 tcp_recvfrom,
5237 ip_build_header,
5238 tcp_connect,
5239 tcp_accept,
5240 ip_queue_xmit,
5241 tcp_retransmit,
5242 tcp_write_wakeup,
5243 tcp_read_wakeup,
5244 tcp_rcv,
5245 tcp_select,
5246 tcp_ioctl,
5247 NULL,
5248 tcp_shutdown,
5249 tcp_setsockopt,
5250 tcp_getsockopt,
5251 128,
5252 0,
5253 "TCP",
5254 0, 0,
5255 {NULL,}
5256 };