1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while sk->inuse=1
26 * and was trying to connect (tcp_err()).
27 * Alan Cox : All icmp error handling was broken
28 * pointers passed where wrong and the
29 * socket was looked up backwards. Nobody
30 * tested any icmp error code obviously.
31 * Alan Cox : tcp_err() now handled properly. It wakes people
32 * on errors. select behaves and the icmp error race
33 * has gone by moving it into sock.c
34 * Alan Cox : tcp_reset() fixed to work for everything not just
35 * packets for unknown sockets.
36 * Alan Cox : tcp option processing.
37 * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong]
38 * Herp Rosmanith : More reset fixes
39 * Alan Cox : No longer acks invalid rst frames. Acking
40 * any kind of RST is right out.
41 * Alan Cox : Sets an ignore me flag on an rst receive
42 * otherwise odd bits of prattle escape still
43 * Alan Cox : Fixed another acking RST frame bug. Should stop
44 * LAN workplace lockups.
45 * Alan Cox : Some tidyups using the new skb list facilities
46 * Alan Cox : sk->keepopen now seems to work
47 * Alan Cox : Pulls options out correctly on accepts
48 * Alan Cox : Fixed assorted sk->rqueue->next errors
49 * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops.
50 * Alan Cox : Tidied tcp_data to avoid a potential nasty.
51 * Alan Cox : Added some better commenting, as the tcp is hard to follow
52 * Alan Cox : Removed incorrect check for 20 * psh
53 * Michael O'Reilly : ack < copied bug fix.
54 * Johannes Stille : Misc tcp fixes (not all in yet).
55 * Alan Cox : FIN with no memory -> CRASH
56 * Alan Cox : Added socket option proto entries. Also added awareness of them to accept.
57 * Alan Cox : Added TCP options (SOL_TCP)
58 * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets.
59 * Alan Cox : Use ip_tos/ip_ttl settings.
60 * Alan Cox : Handle FIN (more) properly (we hope).
61 * Alan Cox : RST frames sent on unsynchronised state ack error/
62 * Alan Cox : Put in missing check for SYN bit.
63 * Alan Cox : Added tcp_select_window() aka NET2E
64 * window non shrink trick.
65 * Alan Cox : Added a couple of small NET2E timer fixes
66 * Charles Hedrick : TCP fixes
67 * Toomas Tamm : TCP window fixes
68 * Alan Cox : Small URG fix to rlogin ^C ack fight
69 * Charles Hedrick : Rewrote most of it to actually work
70 * Linus : Rewrote tcp_read() and URG handling
71 * completely
72 * Gerhard Koerting: Fixed some missing timer handling
73 * Matthew Dillon : Reworked TCP machine states as per RFC
74 * Gerhard Koerting: PC/TCP workarounds
75 * Adam Caldwell : Assorted timer/timing errors
76 * Matthew Dillon : Fixed another RST bug
77 * Alan Cox : Move to kernel side addressing changes.
78 * Alan Cox : Beginning work on TCP fastpathing (not yet usable)
79 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
80 * Alan Cox : TCP fast path debugging
81 * Alan Cox : Window clamping
82 * Michael Riepe : Bug in tcp_check()
83 * Matt Dillon : More TCP improvements and RST bug fixes
84 * Matt Dillon : Yet more small nasties remove from the TCP code
85 * (Be very nice to this man if tcp finally works 100%) 8)
86 * Alan Cox : BSD accept semantics.
87 * Alan Cox : Reset on closedown bug.
88 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
89 * Michael Pall : Handle select() after URG properly in all cases.
90 * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin).
91 * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now.
92 * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api.
93 * Alan Cox : Changed the semantics of sk->socket to
94 * fix a race and a signal problem with
95 * accept() and async I/O.
96 * Alan Cox : Relaxed the rules on tcp_sendto().
97 * Yury Shevchuk : Really fixed accept() blocking problem.
98 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
99 * clients/servers which listen in on
100 * fixed ports.
101 * Alan Cox : Cleaned the above up and shrank it to
102 * a sensible code size.
103 * Alan Cox : Self connect lockup fix.
104 * Alan Cox : No connect to multicast.
105 * Ross Biro : Close unaccepted children on master
106 * socket close.
107 * Alan Cox : Reset tracing code.
108 * Alan Cox : Spurious resets on shutdown.
109 * Alan Cox : Giant 15 minute/60 second timer error
110 * Alan Cox : Small whoops in selecting before an accept.
111 * Alan Cox : Kept the state trace facility since it's
112 * handy for debugging.
113 * Alan Cox : More reset handler fixes.
114 * Alan Cox : Started rewriting the code based on the RFC's
115 * for other useful protocol references see:
116 * Comer, KA9Q NOS, and for a reference on the
117 * difference between specifications and how BSD
118 * works see the 4.4lite source.
119 * A.N.Kuznetsov : Don't time wait on completion of tidy
120 * close.
121 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
122 * Linus Torvalds : Fixed BSD port reuse to work first syn
123 * Alan Cox : Reimplemented timers as per the RFC and using multiple
124 * timers for sanity.
125 * Alan Cox : Small bug fixes, and a lot of new
126 * comments.
127 * Alan Cox : Fixed dual reader crash by locking
128 * the buffers (much like datagram.c)
129 * Alan Cox : Fixed stuck sockets in probe. A probe
130 * now gets fed up of retrying without
131 * (even a no space) answer.
132 * Alan Cox : Extracted closing code better
133 * Alan Cox : Fixed the closing state machine to
134 * resemble the RFC.
135 * Alan Cox : More 'per spec' fixes.
136 * Jorge Cwik : Even faster checksumming.
137 * Alan Cox : tcp_data() doesn't ack illegal PSH
138 * only frames. At least one pc tcp stack
139 * generates them.
140 * Alan Cox : Cache last socket.
141 * Alan Cox : Per route irtt.
142 * Matt Day : Select() match BSD precisely on error
143 *
144 *
145 * To Fix:
146 * Fast path the code. Two things here - fix the window calculation
147 * so it doesn't iterate over the queue, also spot packets with no funny
148 * options arriving in order and process directly.
149 *
150 * Implement RFC 1191 [Path MTU discovery]
151 * Look at the effect of implementing RFC 1337 suggestions and their impact.
152 * Rewrite output state machine to use a single queue and do low window
153 * situations as per the spec (RFC 1122)
154 * Speed up input assembly algorithm.
155 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
156 * could do with it working on IPv4
157 * User settable/learned rtt/max window/mtu
158 * Cope with MTU/device switches when retransmitting in tcp.
159 * Fix the window handling to use PR's new code.
160 *
161 * Change the fundamental structure to a single send queue maintained
162 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on
163 * active routes too]). Cut the queue off in tcp_retransmit/
164 * tcp_transmit.
165 * Change the receive queue to assemble as it goes. This lets us
166 * dispose of most of tcp_sequence, half of tcp_ack and chunks of
167 * tcp_data/tcp_read as well as the window shrink crud.
168 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
169 * tcp_queue_skb seem obvious routines to extract.
170 *
171 * This program is free software; you can redistribute it and/or
172 * modify it under the terms of the GNU General Public License
173 * as published by the Free Software Foundation; either version
174 * 2 of the License, or(at your option) any later version.
175 *
176 * Description of States:
177 *
178 * TCP_SYN_SENT sent a connection request, waiting for ack
179 *
180 * TCP_SYN_RECV received a connection request, sent ack,
181 * waiting for final ack in three-way handshake.
182 *
183 * TCP_ESTABLISHED connection established
184 *
185 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
186 * transmission of remaining buffered data
187 *
188 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
189 * to shutdown
190 *
191 * TCP_CLOSING both sides have shutdown but we still have
192 * data we have to finish sending
193 *
194 * TCP_TIME_WAIT timeout to catch resent junk before entering
195 * closed, can only be entered from FIN_WAIT2
196 * or CLOSING. Required because the other end
197 * may not have gotten our last ACK causing it
198 * to retransmit the data packet (which we ignore)
199 *
200 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
201 * us to finish writing our data and to shutdown
202 * (we have to close() to move on to LAST_ACK)
203 *
204 * TCP_LAST_ACK out side has shutdown after remote has
205 * shutdown. There may still be data in our
206 * buffer that we have to finish sending
207 *
208 * TCP_CLOSE socket is finished
209 */
210
211 #include <linux/types.h>
212 #include <linux/sched.h>
213 #include <linux/mm.h>
214 #include <linux/time.h>
215 #include <linux/string.h>
216 #include <linux/config.h>
217 #include <linux/socket.h>
218 #include <linux/sockios.h>
219 #include <linux/termios.h>
220 #include <linux/in.h>
221 #include <linux/fcntl.h>
222 #include <linux/inet.h>
223 #include <linux/netdevice.h>
224 #include <net/snmp.h>
225 #include <net/ip.h>
226 #include <net/protocol.h>
227 #include <net/icmp.h>
228 #include <net/tcp.h>
229 #include <net/arp.h>
230 #include <linux/skbuff.h>
231 #include <net/sock.h>
232 #include <net/route.h>
233 #include <linux/errno.h>
234 #include <linux/timer.h>
235 #include <asm/system.h>
236 #include <asm/segment.h>
237 #include <linux/mm.h>
238 #include <net/checksum.h>
239
240 /*
241 * The MSL timer is the 'normal' timer.
242 */
243
244 #define reset_msl_timer(x,y,z) reset_timer(x,y,z)
245
246 #define SEQ_TICK 3
247 unsigned long seq_offset;
248 struct tcp_mib tcp_statistics;
249
250 /*
251 * Cached last hit socket
252 */
253
254 volatile unsigned long th_cache_saddr,th_cache_daddr;
255 volatile unsigned short th_cache_dport, th_cache_sport;
256 volatile struct sock *th_cache_sk;
257
258 void tcp_cache_zap(void)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
259 {
260 unsigned long flags;
261 save_flags(flags);
262 cli();
263 th_cache_saddr=0;
264 th_cache_daddr=0;
265 th_cache_dport=0;
266 th_cache_sport=0;
267 th_cache_sk=NULL;
268 restore_flags(flags);
269 }
270
271 static void tcp_close(struct sock *sk, int timeout);
272
273
274 /*
275 * The less said about this the better, but it works and will do for 1.2
276 */
277
278 static struct wait_queue *master_select_wakeup;
279
280 static __inline__ int min(unsigned int a, unsigned int b)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
281 {
282 if (a < b)
283 return(a);
284 return(b);
285 }
286
287 #undef STATE_TRACE
288
289 #ifdef STATE_TRACE
290 static char *statename[]={
291 "Unused","Established","Syn Sent","Syn Recv",
292 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
293 "Close Wait","Last ACK","Listen","Closing"
294 };
295 #endif
296
297 static __inline__ void tcp_set_state(struct sock *sk, int state)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
298 {
299 if(sk->state==TCP_ESTABLISHED)
300 tcp_statistics.TcpCurrEstab--;
301 #ifdef STATE_TRACE
302 if(sk->debug)
303 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
304 #endif
305 /* This is a hack but it doesn't occur often and it's going to
306 be a real to fix nicely */
307
308 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
309 {
310 wake_up_interruptible(&master_select_wakeup);
311 }
312 sk->state=state;
313 if(state==TCP_ESTABLISHED)
314 tcp_statistics.TcpCurrEstab++;
315 }
316
317 /*
318 * This routine picks a TCP windows for a socket based on
319 * the following constraints
320 *
321 * 1. The window can never be shrunk once it is offered (RFC 793)
322 * 2. We limit memory per socket
323 *
324 * For now we use NET2E3's heuristic of offering half the memory
325 * we have handy. All is not as bad as this seems however because
326 * of two things. Firstly we will bin packets even within the window
327 * in order to get the data we are waiting for into the memory limit.
328 * Secondly we bin common duplicate forms at receive time
329 * Better heuristics welcome
330 */
331
332 int tcp_select_window(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
333 {
334 int new_window = sk->prot->rspace(sk);
335
336 if(sk->window_clamp)
337 new_window=min(sk->window_clamp,new_window);
338 /*
339 * Two things are going on here. First, we don't ever offer a
340 * window less than min(sk->mss, MAX_WINDOW/2). This is the
341 * receiver side of SWS as specified in RFC1122.
342 * Second, we always give them at least the window they
343 * had before, in order to avoid retracting window. This
344 * is technically allowed, but RFC1122 advises against it and
345 * in practice it causes trouble.
346 *
347 * Fixme: This doesn't correctly handle the case where
348 * new_window > sk->window but not by enough to allow for the
349 * shift in sequence space.
350 */
351 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
352 return(sk->window);
353 return(new_window);
354 }
355
356 /*
357 * Find someone to 'accept'. Must be called with
358 * sk->inuse=1 or cli()
359 */
360
361 static struct sk_buff *tcp_find_established(struct sock *s)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
362 {
363 struct sk_buff *p=skb_peek(&s->receive_queue);
364 if(p==NULL)
365 return NULL;
366 do
367 {
368 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
369 return p;
370 p=p->next;
371 }
372 while(p!=(struct sk_buff *)&s->receive_queue);
373 return NULL;
374 }
375
376 /*
377 * Remove a completed connection and return it. This is used by
378 * tcp_accept() to get connections from the queue.
379 */
380
381 static struct sk_buff *tcp_dequeue_established(struct sock *s)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
382 {
383 struct sk_buff *skb;
384 unsigned long flags;
385 save_flags(flags);
386 cli();
387 skb=tcp_find_established(s);
388 if(skb!=NULL)
389 skb_unlink(skb); /* Take it off the queue */
390 restore_flags(flags);
391 return skb;
392 }
393
394 /*
395 * This routine closes sockets which have been at least partially
396 * opened, but not yet accepted. Currently it is only called by
397 * tcp_close, and timeout mirrors the value there.
398 */
399
400 static void tcp_close_pending (struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
401 {
402 struct sk_buff *skb;
403
404 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
405 {
406 skb->sk->dead=1;
407 tcp_close(skb->sk, 0);
408 kfree_skb(skb, FREE_READ);
409 }
410 return;
411 }
412
413 /*
414 * Enter the time wait state.
415 */
416
417 static void tcp_time_wait(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
418 {
419 tcp_set_state(sk,TCP_TIME_WAIT);
420 sk->shutdown = SHUTDOWN_MASK;
421 if (!sk->dead)
422 sk->state_change(sk);
423 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
424 }
425
426 /*
427 * A socket has timed out on its send queue and wants to do a
428 * little retransmitting. Currently this means TCP.
429 */
430
431 void tcp_do_retransmit(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
432 {
433 struct sk_buff * skb;
434 struct proto *prot;
435 struct device *dev;
436 int ct=0;
437
438 prot = sk->prot;
439 skb = sk->send_head;
440
441 while (skb != NULL)
442 {
443 struct tcphdr *th;
444 struct iphdr *iph;
445 int size;
446
447 dev = skb->dev;
448 IS_SKB(skb);
449 skb->when = jiffies;
450
451 /*
452 * In general it's OK just to use the old packet. However we
453 * need to use the current ack and window fields. Urg and
454 * urg_ptr could possibly stand to be updated as well, but we
455 * don't keep the necessary data. That shouldn't be a problem,
456 * if the other end is doing the right thing. Since we're
457 * changing the packet, we have to issue a new IP identifier.
458 */
459
460 iph = (struct iphdr *)(skb->data + dev->hard_header_len);
461 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
462 size = skb->len - (((unsigned char *) th) - skb->data);
463
464 /*
465 * Note: We ought to check for window limits here but
466 * currently this is done (less efficiently) elsewhere.
467 * We do need to check for a route change but can't handle
468 * that until we have the new 1.3.x buffers in.
469 *
470 */
471
472 iph->id = htons(ip_id_count++);
473 ip_send_check(iph);
474
475 /*
476 * This is not the right way to handle this. We have to
477 * issue an up to date window and ack report with this
478 * retransmit to keep the odd buggy tcp that relies on
479 * the fact BSD does this happy.
480 * We don't however need to recalculate the entire
481 * checksum, so someone wanting a small problem to play
482 * with might like to implement RFC1141/RFC1624 and speed
483 * this up by avoiding a full checksum.
484 */
485
486 th->ack_seq = ntohl(sk->acked_seq);
487 th->window = ntohs(tcp_select_window(sk));
488 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
489
490 /*
491 * If the interface is (still) up and running, kick it.
492 */
493
494 if (dev->flags & IFF_UP)
495 {
496 /*
497 * If the packet is still being sent by the device/protocol
498 * below then don't retransmit. This is both needed, and good -
499 * especially with connected mode AX.25 where it stops resends
500 * occurring of an as yet unsent anyway frame!
501 * We still add up the counts as the round trip time wants
502 * adjusting.
503 */
504 if (sk && !skb_device_locked(skb))
505 {
506 /* Remove it from any existing driver queue first! */
507 skb_unlink(skb);
508 /* Now queue it */
509 ip_statistics.IpOutRequests++;
510 dev_queue_xmit(skb, dev, sk->priority);
511 }
512 }
513
514 /*
515 * Count retransmissions
516 */
517
518 ct++;
519 sk->prot->retransmits ++;
520
521 /*
522 * Only one retransmit requested.
523 */
524
525 if (!all)
526 break;
527
528 /*
529 * This should cut it off before we send too many packets.
530 */
531
532 if (ct >= sk->cong_window)
533 break;
534 skb = skb->link3;
535 }
536 }
537
538 /*
539 * Reset the retransmission timer
540 */
541
542 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
543 {
544 del_timer(&sk->retransmit_timer);
545 sk->ip_xmit_timeout = why;
546 if((int)when < 0)
547 {
548 when=3;
549 printk("Error: Negative timer in xmit_timer\n");
550 }
551 sk->retransmit_timer.expires=when;
552 add_timer(&sk->retransmit_timer);
553 }
554
555 /*
556 * This is the normal code called for timeouts. It does the retransmission
557 * and then does backoff. tcp_do_retransmit is separated out because
558 * tcp_ack needs to send stuff from the retransmit queue without
559 * initiating a backoff.
560 */
561
562
563 void tcp_retransmit_time(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
564 {
565 tcp_do_retransmit(sk, all);
566
567 /*
568 * Increase the timeout each time we retransmit. Note that
569 * we do not increase the rtt estimate. rto is initialized
570 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
571 * that doubling rto each time is the least we can get away with.
572 * In KA9Q, Karn uses this for the first few times, and then
573 * goes to quadratic. netBSD doubles, but only goes up to *64,
574 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
575 * defined in the protocol as the maximum possible RTT. I guess
576 * we'll have to use something other than TCP to talk to the
577 * University of Mars.
578 *
579 * PAWS allows us longer timeouts and large windows, so once
580 * implemented ftp to mars will work nicely. We will have to fix
581 * the 120 second clamps though!
582 */
583
584 sk->retransmits++;
585 sk->backoff++;
586 sk->rto = min(sk->rto << 1, 120*HZ);
587 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
588 }
589
590
591 /*
592 * A timer event has trigger a tcp retransmit timeout. The
593 * socket xmit queue is ready and set up to send. Because
594 * the ack receive code keeps the queue straight we do
595 * nothing clever here.
596 */
597
598 static void tcp_retransmit(struct sock *sk, int all)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
599 {
600 if (all)
601 {
602 tcp_retransmit_time(sk, all);
603 return;
604 }
605
606 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
607 /* sk->ssthresh in theory can be zero. I guess that's OK */
608 sk->cong_count = 0;
609
610 sk->cong_window = 1;
611
612 /* Do the actual retransmit. */
613 tcp_retransmit_time(sk, all);
614 }
615
616 /*
617 * A write timeout has occurred. Process the after effects.
618 */
619
620 static int tcp_write_timeout(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
621 {
622 /*
623 * Look for a 'soft' timeout.
624 */
625 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
626 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
627 {
628 /*
629 * Attempt to recover if arp has changed (unlikely!) or
630 * a route has shifted (not supported prior to 1.3).
631 */
632 arp_destroy (sk->daddr, 0);
633 /*ip_route_check (sk->daddr);*/
634 }
635 /*
636 * Has it gone just too far ?
637 */
638 if (sk->retransmits > TCP_RETR2)
639 {
640 sk->err = ETIMEDOUT;
641 sk->error_report(sk);
642 del_timer(&sk->retransmit_timer);
643 /*
644 * Time wait the socket
645 */
646 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
647 {
648 tcp_set_state(sk,TCP_TIME_WAIT);
649 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
650 }
651 else
652 {
653 /*
654 * Clean up time.
655 */
656 tcp_set_state(sk, TCP_CLOSE);
657 return 0;
658 }
659 }
660 return 1;
661 }
662
663 /*
664 * The TCP retransmit timer. This lacks a few small details.
665 *
666 * 1. An initial rtt timeout on the probe0 should cause what we can
667 * of the first write queue buffer to be split and sent.
668 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
669 * ETIMEDOUT if we know an additional 'soft' error caused this.
670 * tcp_err should save a 'soft error' for us.
671 */
672
673 static void retransmit_timer(unsigned long data)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
674 {
675 struct sock *sk = (struct sock*)data;
676 int why = sk->ip_xmit_timeout;
677
678 /*
679 * only process if socket is not in use
680 */
681
682 cli();
683 if (sk->inuse || in_bh)
684 {
685 /* Try again in 1 second */
686 sk->retransmit_timer.expires = HZ;
687 add_timer(&sk->retransmit_timer);
688 sti();
689 return;
690 }
691
692 sk->inuse = 1;
693 sti();
694
695 /* Always see if we need to send an ack. */
696
697 if (sk->ack_backlog && !sk->zapped)
698 {
699 sk->prot->read_wakeup (sk);
700 if (! sk->dead)
701 sk->data_ready(sk,0);
702 }
703
704 /* Now we need to figure out why the socket was on the timer. */
705
706 switch (why)
707 {
708 /* Window probing */
709 case TIME_PROBE0:
710 tcp_send_probe0(sk);
711 tcp_write_timeout(sk);
712 break;
713 /* Retransmitting */
714 case TIME_WRITE:
715 /* It could be we got here because we needed to send an ack.
716 * So we need to check for that.
717 */
718 {
719 struct sk_buff *skb;
720 unsigned long flags;
721
722 save_flags(flags);
723 cli();
724 skb = sk->send_head;
725 if (!skb)
726 {
727 restore_flags(flags);
728 }
729 else
730 {
731 /*
732 * Kicked by a delayed ack. Reset timer
733 * correctly now
734 */
735 if (jiffies < skb->when + sk->rto)
736 {
737 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
738 restore_flags(flags);
739 break;
740 }
741 restore_flags(flags);
742 /*
743 * Retransmission
744 */
745 sk->prot->retransmit (sk, 0);
746 tcp_write_timeout(sk);
747 }
748 break;
749 }
750 /* Sending Keepalives */
751 case TIME_KEEPOPEN:
752 /*
753 * this reset_timer() call is a hack, this is not
754 * how KEEPOPEN is supposed to work.
755 */
756 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
757
758 /* Send something to keep the connection open. */
759 if (sk->prot->write_wakeup)
760 sk->prot->write_wakeup (sk);
761 sk->retransmits++;
762 tcp_write_timeout(sk);
763 break;
764 default:
765 printk ("rexmit_timer: timer expired - reason unknown\n");
766 break;
767 }
768 release_sock(sk);
769 }
770
771 /*
772 * This routine is called by the ICMP module when it gets some
773 * sort of error condition. If err < 0 then the socket should
774 * be closed and the error returned to the user. If err > 0
775 * it's just the icmp type << 8 | icmp code. After adjustment
776 * header points to the first 8 bytes of the tcp header. We need
777 * to find the appropriate port.
778 */
779
780 void tcp_err(int err, unsigned char *header, unsigned long daddr,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
781 unsigned long saddr, struct inet_protocol *protocol)
782 {
783 struct tcphdr *th;
784 struct sock *sk;
785 struct iphdr *iph=(struct iphdr *)header;
786
787 header+=4*iph->ihl;
788
789
790 th =(struct tcphdr *)header;
791 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
792
793 if (sk == NULL)
794 return;
795
796 if(err<0)
797 {
798 sk->err = -err;
799 sk->error_report(sk);
800 return;
801 }
802
803 if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8))
804 {
805 /*
806 * FIXME:
807 * For now we will just trigger a linear backoff.
808 * The slow start code should cause a real backoff here.
809 */
810 if (sk->cong_window > 4)
811 sk->cong_window--;
812 return;
813 }
814
815 /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */
816
817 /*
818 * If we've already connected we will keep trying
819 * until we time out, or the user gives up.
820 */
821
822 if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT)
823 {
824 if (sk->state == TCP_SYN_SENT)
825 {
826 tcp_statistics.TcpAttemptFails++;
827 tcp_set_state(sk,TCP_CLOSE);
828 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
829 }
830 sk->err = icmp_err_convert[err & 0xff].errno;
831 }
832 return;
833 }
834
835
836 /*
837 * Walk down the receive queue counting readable data until we hit the end or we find a gap
838 * in the received data queue (ie a frame missing that needs sending to us). Not
839 * sorting using two queues as data arrives makes life so much harder.
840 */
841
842 static int tcp_readable(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
843 {
844 unsigned long counted;
845 unsigned long amount;
846 struct sk_buff *skb;
847 int sum;
848 unsigned long flags;
849
850 if(sk && sk->debug)
851 printk("tcp_readable: %p - ",sk);
852
853 save_flags(flags);
854 cli();
855 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
856 {
857 restore_flags(flags);
858 if(sk && sk->debug)
859 printk("empty\n");
860 return(0);
861 }
862
863 counted = sk->copied_seq; /* Where we are at the moment */
864 amount = 0;
865
866 /*
867 * Do until a push or until we are out of data.
868 */
869
870 do
871 {
872 if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */
873 break;
874 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
875 if (skb->h.th->syn)
876 sum++;
877 if (sum > 0)
878 { /* Add it up, move on */
879 amount += sum;
880 if (skb->h.th->syn)
881 amount--;
882 counted += sum;
883 }
884 /*
885 * Don't count urg data ... but do it in the right place!
886 * Consider: "old_data (ptr is here) URG PUSH data"
887 * The old code would stop at the first push because
888 * it counted the urg (amount==1) and then does amount--
889 * *after* the loop. This means tcp_readable() always
890 * returned zero if any URG PUSH was in the queue, even
891 * though there was normal data available. If we subtract
892 * the urg data right here, we even get it to work for more
893 * than one URG PUSH skb without normal data.
894 * This means that select() finally works now with urg data
895 * in the queue. Note that rlogin was never affected
896 * because it doesn't use select(); it uses two processes
897 * and a blocking read(). And the queue scan in tcp_read()
898 * was correct. Mike <pall@rz.uni-karlsruhe.de>
899 */
900 if (skb->h.th->urg)
901 amount--; /* don't count urg data */
902 if (amount && skb->h.th->psh) break;
903 skb = skb->next;
904 }
905 while(skb != (struct sk_buff *)&sk->receive_queue);
906
907 restore_flags(flags);
908 if(sk->debug)
909 printk("got %lu bytes.\n",amount);
910 return(amount);
911 }
912
913 /*
914 * LISTEN is a special case for select..
915 */
916 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
917 {
918 if (sel_type == SEL_IN) {
919 int retval;
920
921 sk->inuse = 1;
922 retval = (tcp_find_established(sk) != NULL);
923 release_sock(sk);
924 if (!retval)
925 select_wait(&master_select_wakeup,wait);
926 return retval;
927 }
928 return 0;
929 }
930
931
932 /*
933 * Wait for a TCP event.
934 *
935 * Note that we don't need to set "sk->inuse", as the upper select layers
936 * take care of normal races (between the test and the event) and we don't
937 * go look at any of the socket buffers directly.
938 */
939 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
940 {
941 if (sk->state == TCP_LISTEN)
942 return tcp_listen_select(sk, sel_type, wait);
943
944 switch(sel_type) {
945 case SEL_IN:
946 if (sk->err)
947 return 1;
948 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
949 break;
950
951 if (sk->shutdown & RCV_SHUTDOWN)
952 return 1;
953
954 if (sk->acked_seq == sk->copied_seq)
955 break;
956
957 if (sk->urg_seq != sk->copied_seq ||
958 sk->acked_seq != sk->copied_seq+1 ||
959 sk->urginline || !sk->urg_data)
960 return 1;
961 break;
962
963 case SEL_OUT:
964 if (sk->err)
965 return 1;
966 if (sk->shutdown & SEND_SHUTDOWN)
967 return 0;
968 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
969 break;
970 /*
971 * This is now right thanks to a small fix
972 * by Matt Dillon.
973 */
974
975 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
976 break;
977 return 1;
978
979 case SEL_EX:
980 if (sk->urg_data)
981 return 1;
982 break;
983 }
984 select_wait(sk->sleep, wait);
985 return 0;
986 }
987
988 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
989 {
990 int err;
991 switch(cmd)
992 {
993
994 case TIOCINQ:
995 #ifdef FIXME /* FIXME: */
996 case FIONREAD:
997 #endif
998 {
999 unsigned long amount;
1000
1001 if (sk->state == TCP_LISTEN)
1002 return(-EINVAL);
1003
1004 sk->inuse = 1;
1005 amount = tcp_readable(sk);
1006 release_sock(sk);
1007 err=verify_area(VERIFY_WRITE,(void *)arg,
1008 sizeof(unsigned long));
1009 if(err)
1010 return err;
1011 put_fs_long(amount,(unsigned long *)arg);
1012 return(0);
1013 }
1014 case SIOCATMARK:
1015 {
1016 int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1017
1018 err = verify_area(VERIFY_WRITE,(void *) arg,
1019 sizeof(unsigned long));
1020 if (err)
1021 return err;
1022 put_fs_long(answ,(int *) arg);
1023 return(0);
1024 }
1025 case TIOCOUTQ:
1026 {
1027 unsigned long amount;
1028
1029 if (sk->state == TCP_LISTEN) return(-EINVAL);
1030 amount = sk->prot->wspace(sk);
1031 err=verify_area(VERIFY_WRITE,(void *)arg,
1032 sizeof(unsigned long));
1033 if(err)
1034 return err;
1035 put_fs_long(amount,(unsigned long *)arg);
1036 return(0);
1037 }
1038 default:
1039 return(-EINVAL);
1040 }
1041 }
1042
1043
1044 /*
1045 * This routine computes a TCP checksum.
1046 *
1047 * Modified January 1995 from a go-faster DOS routine by
1048 * Jorge Cwik <jorge@laser.satlink.net>
1049 */
1050
1051 unsigned short tcp_check(struct tcphdr *th, int len,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1052 unsigned long saddr, unsigned long daddr)
1053 {
1054 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,
1055 csum_partial((char *)th,len,0));
1056 }
1057
1058
1059
1060 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1061 unsigned long daddr, int len, struct sock *sk)
1062 {
1063 th->check = 0;
1064 th->check = tcp_check(th, len, saddr, daddr);
1065 return;
1066 }
1067
1068 /*
1069 * This is the main buffer sending routine. We queue the buffer
1070 * having checked it is sane seeming.
1071 */
1072
1073 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1074 {
1075 int size;
1076 struct tcphdr * th = skb->h.th;
1077
1078 /*
1079 * length of packet (not counting length of pre-tcp headers)
1080 */
1081
1082 size = skb->len - ((unsigned char *) th - skb->data);
1083
1084 /*
1085 * Sanity check it..
1086 */
1087
1088 if (size < sizeof(struct tcphdr) || size > skb->len)
1089 {
1090 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1091 skb, skb->data, th, skb->len);
1092 kfree_skb(skb, FREE_WRITE);
1093 return;
1094 }
1095
1096 /*
1097 * If we have queued a header size packet.. (these crash a few
1098 * tcp stacks if ack is not set)
1099 */
1100
1101 if (size == sizeof(struct tcphdr))
1102 {
1103 /* If it's got a syn or fin it's notionally included in the size..*/
1104 if(!th->syn && !th->fin)
1105 {
1106 printk("tcp_send_skb: attempt to queue a bogon.\n");
1107 kfree_skb(skb,FREE_WRITE);
1108 return;
1109 }
1110 }
1111
1112 /*
1113 * Actual processing.
1114 */
1115
1116 tcp_statistics.TcpOutSegs++;
1117 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1118
1119 /*
1120 * We must queue if
1121 *
1122 * a) The right edge of this frame exceeds the window
1123 * b) We are retransmitting (Nagle's rule)
1124 * c) We have too many packets 'in flight'
1125 */
1126
1127 if (after(skb->h.seq, sk->window_seq) ||
1128 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1129 sk->packets_out >= sk->cong_window)
1130 {
1131 /* checksum will be supplied by tcp_write_xmit. So
1132 * we shouldn't need to set it at all. I'm being paranoid */
1133 th->check = 0;
1134 if (skb->next != NULL)
1135 {
1136 printk("tcp_send_partial: next != NULL\n");
1137 skb_unlink(skb);
1138 }
1139 skb_queue_tail(&sk->write_queue, skb);
1140
1141 /*
1142 * If we don't fit we have to start the zero window
1143 * probes. This is broken - we really need to do a partial
1144 * send _first_ (This is what causes the Cisco and PC/TCP
1145 * grief).
1146 */
1147
1148 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1149 sk->send_head == NULL && sk->ack_backlog == 0)
1150 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1151 }
1152 else
1153 {
1154 /*
1155 * This is going straight out
1156 */
1157
1158 th->ack_seq = ntohl(sk->acked_seq);
1159 th->window = ntohs(tcp_select_window(sk));
1160
1161 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1162
1163 sk->sent_seq = sk->write_seq;
1164
1165 /*
1166 * This is mad. The tcp retransmit queue is put together
1167 * by the ip layer. This causes half the problems with
1168 * unroutable FIN's and other things.
1169 */
1170
1171 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1172
1173 /*
1174 * Set for next retransmit based on expected ACK time.
1175 * FIXME: We set this every time which means our
1176 * retransmits are really about a window behind.
1177 */
1178
1179 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1180 }
1181 }
1182
1183 /*
1184 * Locking problems lead us to a messy situation where we can have
1185 * multiple partially complete buffers queued up. This is really bad
1186 * as we don't want to be sending partial buffers. Fix this with
1187 * a semaphore or similar to lock tcp_write per socket.
1188 *
1189 * These routines are pretty self descriptive.
1190 */
1191
1192 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1193 {
1194 struct sk_buff * skb;
1195 unsigned long flags;
1196
1197 save_flags(flags);
1198 cli();
1199 skb = sk->partial;
1200 if (skb) {
1201 sk->partial = NULL;
1202 del_timer(&sk->partial_timer);
1203 }
1204 restore_flags(flags);
1205 return skb;
1206 }
1207
1208 /*
1209 * Empty the partial queue
1210 */
1211
1212 static void tcp_send_partial(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1213 {
1214 struct sk_buff *skb;
1215
1216 if (sk == NULL)
1217 return;
1218 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1219 tcp_send_skb(sk, skb);
1220 }
1221
1222 /*
1223 * Queue a partial frame
1224 */
1225
1226 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1227 {
1228 struct sk_buff * tmp;
1229 unsigned long flags;
1230
1231 save_flags(flags);
1232 cli();
1233 tmp = sk->partial;
1234 if (tmp)
1235 del_timer(&sk->partial_timer);
1236 sk->partial = skb;
1237 init_timer(&sk->partial_timer);
1238 /*
1239 * Wait up to 1 second for the buffer to fill.
1240 */
1241 sk->partial_timer.expires = HZ;
1242 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1243 sk->partial_timer.data = (unsigned long) sk;
1244 add_timer(&sk->partial_timer);
1245 restore_flags(flags);
1246 if (tmp)
1247 tcp_send_skb(sk, tmp);
1248 }
1249
1250
1251 /*
1252 * This routine sends an ack and also updates the window.
1253 */
1254
1255 static void tcp_send_ack(u32 sequence, u32 ack,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1256 struct sock *sk,
1257 struct tcphdr *th, unsigned long daddr)
1258 {
1259 struct sk_buff *buff;
1260 struct tcphdr *t1;
1261 struct device *dev = NULL;
1262 int tmp;
1263
1264 if(sk->zapped)
1265 return; /* We have been reset, we may not send again */
1266
1267 /*
1268 * We need to grab some memory, and put together an ack,
1269 * and then put it into the queue to be sent.
1270 */
1271
1272 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1273 if (buff == NULL)
1274 {
1275 /*
1276 * Force it to send an ack. We don't have to do this
1277 * (ACK is unreliable) but it's much better use of
1278 * bandwidth on slow links to send a spare ack than
1279 * resend packets.
1280 */
1281
1282 sk->ack_backlog++;
1283 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1284 {
1285 reset_xmit_timer(sk, TIME_WRITE, HZ);
1286 }
1287 return;
1288 }
1289
1290 /*
1291 * Assemble a suitable TCP frame
1292 */
1293
1294 buff->len = sizeof(struct tcphdr);
1295 buff->sk = sk;
1296 buff->localroute = sk->localroute;
1297 t1 =(struct tcphdr *) buff->data;
1298
1299 /*
1300 * Put in the IP header and routing stuff.
1301 */
1302
1303 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1304 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1305 if (tmp < 0)
1306 {
1307 buff->free = 1;
1308 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1309 return;
1310 }
1311 buff->len += tmp;
1312 t1 =(struct tcphdr *)((char *)t1 +tmp);
1313
1314 memcpy(t1, th, sizeof(*t1));
1315
1316 /*
1317 * Swap the send and the receive.
1318 */
1319
1320 t1->dest = th->source;
1321 t1->source = th->dest;
1322 t1->seq = ntohl(sequence);
1323 t1->ack = 1;
1324 sk->window = tcp_select_window(sk);
1325 t1->window = ntohs(sk->window);
1326 t1->res1 = 0;
1327 t1->res2 = 0;
1328 t1->rst = 0;
1329 t1->urg = 0;
1330 t1->syn = 0;
1331 t1->psh = 0;
1332 t1->fin = 0;
1333
1334 /*
1335 * If we have nothing queued for transmit and the transmit timer
1336 * is on we are just doing an ACK timeout and need to switch
1337 * to a keepalive.
1338 */
1339
1340 if (ack == sk->acked_seq)
1341 {
1342 sk->ack_backlog = 0;
1343 sk->bytes_rcv = 0;
1344 sk->ack_timed = 0;
1345 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1346 && sk->ip_xmit_timeout == TIME_WRITE)
1347 {
1348 if(sk->keepopen) {
1349 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1350 } else {
1351 delete_timer(sk);
1352 }
1353 }
1354 }
1355
1356 /*
1357 * Fill in the packet and send it
1358 */
1359
1360 t1->ack_seq = ntohl(ack);
1361 t1->doff = sizeof(*t1)/4;
1362 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1363 if (sk->debug)
1364 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1365 tcp_statistics.TcpOutSegs++;
1366 sk->prot->queue_xmit(sk, dev, buff, 1);
1367 }
1368
1369
1370 /*
1371 * This routine builds a generic TCP header.
1372 */
1373
1374 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1375 {
1376
1377 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1378 th->seq = htonl(sk->write_seq);
1379 th->psh =(push == 0) ? 1 : 0;
1380 th->doff = sizeof(*th)/4;
1381 th->ack = 1;
1382 th->fin = 0;
1383 sk->ack_backlog = 0;
1384 sk->bytes_rcv = 0;
1385 sk->ack_timed = 0;
1386 th->ack_seq = htonl(sk->acked_seq);
1387 sk->window = tcp_select_window(sk);
1388 th->window = htons(sk->window);
1389
1390 return(sizeof(*th));
1391 }
1392
1393 /*
1394 * This routine copies from a user buffer into a socket,
1395 * and starts the transmit system.
1396 */
1397
1398 static int tcp_write(struct sock *sk, unsigned char *from,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1399 int len, int nonblock, unsigned flags)
1400 {
1401 int copied = 0;
1402 int copy;
1403 int tmp;
1404 struct sk_buff *skb;
1405 struct sk_buff *send_tmp;
1406 unsigned char *buff;
1407 struct proto *prot;
1408 struct device *dev = NULL;
1409
1410 sk->inuse=1;
1411 prot = sk->prot;
1412 while(len > 0)
1413 {
1414 if (sk->err)
1415 { /* Stop on an error */
1416 release_sock(sk);
1417 if (copied)
1418 return(copied);
1419 tmp = -sk->err;
1420 sk->err = 0;
1421 return(tmp);
1422 }
1423
1424 /*
1425 * First thing we do is make sure that we are established.
1426 */
1427
1428 if (sk->shutdown & SEND_SHUTDOWN)
1429 {
1430 release_sock(sk);
1431 sk->err = EPIPE;
1432 if (copied)
1433 return(copied);
1434 sk->err = 0;
1435 return(-EPIPE);
1436 }
1437
1438 /*
1439 * Wait for a connection to finish.
1440 */
1441
1442 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1443 {
1444 if (sk->err)
1445 {
1446 release_sock(sk);
1447 if (copied)
1448 return(copied);
1449 tmp = -sk->err;
1450 sk->err = 0;
1451 return(tmp);
1452 }
1453
1454 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1455 {
1456 release_sock(sk);
1457 if (copied)
1458 return(copied);
1459
1460 if (sk->err)
1461 {
1462 tmp = -sk->err;
1463 sk->err = 0;
1464 return(tmp);
1465 }
1466
1467 if (sk->keepopen)
1468 {
1469 send_sig(SIGPIPE, current, 0);
1470 }
1471 return(-EPIPE);
1472 }
1473
1474 if (nonblock || copied)
1475 {
1476 release_sock(sk);
1477 if (copied)
1478 return(copied);
1479 return(-EAGAIN);
1480 }
1481
1482 release_sock(sk);
1483 cli();
1484
1485 if (sk->state != TCP_ESTABLISHED &&
1486 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1487 {
1488 interruptible_sleep_on(sk->sleep);
1489 if (current->signal & ~current->blocked)
1490 {
1491 sti();
1492 if (copied)
1493 return(copied);
1494 return(-ERESTARTSYS);
1495 }
1496 }
1497 sk->inuse = 1;
1498 sti();
1499 }
1500
1501 /*
1502 * The following code can result in copy <= if sk->mss is ever
1503 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1504 * sk->mtu is constant once SYN processing is finished. I.e. we
1505 * had better not get here until we've seen his SYN and at least one
1506 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1507 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1508 * non-decreasing. Note that any ioctl to set user_mss must be done
1509 * before the exchange of SYN's. If the initial ack from the other
1510 * end has a window of 0, max_window and thus mss will both be 0.
1511 */
1512
1513 /*
1514 * Now we need to check if we have a half built packet.
1515 */
1516
1517 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1518 {
1519 int hdrlen;
1520
1521 /* IP header + TCP header */
1522 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1523 + sizeof(struct tcphdr);
1524
1525 /* Add more stuff to the end of skb->len */
1526 if (!(flags & MSG_OOB))
1527 {
1528 copy = min(sk->mss - (skb->len - hdrlen), len);
1529 /* FIXME: this is really a bug. */
1530 if (copy <= 0)
1531 {
1532 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1533 copy = 0;
1534 }
1535
1536 memcpy_fromfs(skb->data + skb->len, from, copy);
1537 skb->len += copy;
1538 from += copy;
1539 copied += copy;
1540 len -= copy;
1541 sk->write_seq += copy;
1542 }
1543 if ((skb->len - hdrlen) >= sk->mss ||
1544 (flags & MSG_OOB) || !sk->packets_out)
1545 tcp_send_skb(sk, skb);
1546 else
1547 tcp_enqueue_partial(skb, sk);
1548 continue;
1549 }
1550
1551 /*
1552 * We also need to worry about the window.
1553 * If window < 1/2 the maximum window we've seen from this
1554 * host, don't use it. This is sender side
1555 * silly window prevention, as specified in RFC1122.
1556 * (Note that this is different than earlier versions of
1557 * SWS prevention, e.g. RFC813.). What we actually do is
1558 * use the whole MSS. Since the results in the right
1559 * edge of the packet being outside the window, it will
1560 * be queued for later rather than sent.
1561 */
1562
1563 copy = sk->window_seq - sk->write_seq;
1564 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1565 copy = sk->mss;
1566 if (copy > len)
1567 copy = len;
1568
1569 /*
1570 * We should really check the window here also.
1571 */
1572
1573 send_tmp = NULL;
1574 if (copy < sk->mss && !(flags & MSG_OOB))
1575 {
1576 /*
1577 * We will release the socket in case we sleep here.
1578 */
1579 release_sock(sk);
1580 /*
1581 * NB: following must be mtu, because mss can be increased.
1582 * mss is always <= mtu
1583 */
1584 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
1585 sk->inuse = 1;
1586 send_tmp = skb;
1587 }
1588 else
1589 {
1590 /*
1591 * We will release the socket in case we sleep here.
1592 */
1593 release_sock(sk);
1594 skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
1595 sk->inuse = 1;
1596 }
1597
1598 /*
1599 * If we didn't get any memory, we need to sleep.
1600 */
1601
1602 if (skb == NULL)
1603 {
1604 sk->socket->flags |= SO_NOSPACE;
1605 if (nonblock)
1606 {
1607 release_sock(sk);
1608 if (copied)
1609 return(copied);
1610 return(-EAGAIN);
1611 }
1612
1613 /*
1614 * FIXME: here is another race condition.
1615 */
1616
1617 tmp = sk->wmem_alloc;
1618 release_sock(sk);
1619 cli();
1620 /*
1621 * Again we will try to avoid it.
1622 */
1623 if (tmp <= sk->wmem_alloc &&
1624 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1625 && sk->err == 0)
1626 {
1627 sk->socket->flags &= ~SO_NOSPACE;
1628 interruptible_sleep_on(sk->sleep);
1629 if (current->signal & ~current->blocked)
1630 {
1631 sti();
1632 if (copied)
1633 return(copied);
1634 return(-ERESTARTSYS);
1635 }
1636 }
1637 sk->inuse = 1;
1638 sti();
1639 continue;
1640 }
1641
1642 skb->len = 0;
1643 skb->sk = sk;
1644 skb->free = 0;
1645 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1646
1647 buff = skb->data;
1648
1649 /*
1650 * FIXME: we need to optimize this.
1651 * Perhaps some hints here would be good.
1652 */
1653
1654 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1655 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
1656 if (tmp < 0 )
1657 {
1658 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1659 release_sock(sk);
1660 if (copied)
1661 return(copied);
1662 return(tmp);
1663 }
1664 skb->len += tmp;
1665 skb->dev = dev;
1666 buff += tmp;
1667 skb->h.th =(struct tcphdr *) buff;
1668 tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
1669 if (tmp < 0)
1670 {
1671 prot->wfree(sk, skb->mem_addr, skb->mem_len);
1672 release_sock(sk);
1673 if (copied)
1674 return(copied);
1675 return(tmp);
1676 }
1677
1678 if (flags & MSG_OOB)
1679 {
1680 ((struct tcphdr *)buff)->urg = 1;
1681 ((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
1682 }
1683 skb->len += tmp;
1684 memcpy_fromfs(buff+tmp, from, copy);
1685
1686 from += copy;
1687 copied += copy;
1688 len -= copy;
1689 skb->len += copy;
1690 skb->free = 0;
1691 sk->write_seq += copy;
1692
1693 if (send_tmp != NULL && sk->packets_out)
1694 {
1695 tcp_enqueue_partial(send_tmp, sk);
1696 continue;
1697 }
1698 tcp_send_skb(sk, skb);
1699 }
1700 sk->err = 0;
1701
1702 /*
1703 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1704 * interactive fast network servers. It's meant to be on and
1705 * it really improves the throughput though not the echo time
1706 * on my slow slip link - Alan
1707 */
1708
1709 /*
1710 * Avoid possible race on send_tmp - c/o Johannes Stille
1711 */
1712
1713 if(sk->partial && ((!sk->packets_out)
1714 /* If not nagling we can send on the before case too.. */
1715 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1716 ))
1717 tcp_send_partial(sk);
1718
1719 release_sock(sk);
1720 return(copied);
1721 }
1722
1723 /*
1724 * This is just a wrapper.
1725 */
1726
1727 static int tcp_sendto(struct sock *sk, unsigned char *from,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1728 int len, int nonblock, unsigned flags,
1729 struct sockaddr_in *addr, int addr_len)
1730 {
1731 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1732 return -EINVAL;
1733 if (sk->state == TCP_CLOSE)
1734 return -ENOTCONN;
1735 if (addr_len < sizeof(*addr))
1736 return -EINVAL;
1737 if (addr->sin_family && addr->sin_family != AF_INET)
1738 return -EINVAL;
1739 if (addr->sin_port != sk->dummy_th.dest)
1740 return -EISCONN;
1741 if (addr->sin_addr.s_addr != sk->daddr)
1742 return -EISCONN;
1743 return tcp_write(sk, from, len, nonblock, flags);
1744 }
1745
1746
1747 /*
1748 * Send an ack if one is backlogged at this point. Ought to merge
1749 * this with tcp_send_ack().
1750 */
1751
1752 static void tcp_read_wakeup(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1753 {
1754 int tmp;
1755 struct device *dev = NULL;
1756 struct tcphdr *t1;
1757 struct sk_buff *buff;
1758
1759 if (!sk->ack_backlog)
1760 return;
1761
1762 /*
1763 * FIXME: we need to put code here to prevent this routine from
1764 * being called. Being called once in a while is ok, so only check
1765 * if this is the second time in a row.
1766 */
1767
1768 /*
1769 * We need to grab some memory, and put together an ack,
1770 * and then put it into the queue to be sent.
1771 */
1772
1773 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
1774 if (buff == NULL)
1775 {
1776 /* Try again real soon. */
1777 reset_xmit_timer(sk, TIME_WRITE, HZ);
1778 return;
1779 }
1780
1781 buff->len = sizeof(struct tcphdr);
1782 buff->sk = sk;
1783 buff->localroute = sk->localroute;
1784
1785 /*
1786 * Put in the IP header and routing stuff.
1787 */
1788
1789 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
1790 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1791 if (tmp < 0)
1792 {
1793 buff->free = 1;
1794 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
1795 return;
1796 }
1797
1798 buff->len += tmp;
1799 t1 =(struct tcphdr *)(buff->data +tmp);
1800
1801 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
1802 t1->seq = htonl(sk->sent_seq);
1803 t1->ack = 1;
1804 t1->res1 = 0;
1805 t1->res2 = 0;
1806 t1->rst = 0;
1807 t1->urg = 0;
1808 t1->syn = 0;
1809 t1->psh = 0;
1810 sk->ack_backlog = 0;
1811 sk->bytes_rcv = 0;
1812 sk->window = tcp_select_window(sk);
1813 t1->window = ntohs(sk->window);
1814 t1->ack_seq = ntohl(sk->acked_seq);
1815 t1->doff = sizeof(*t1)/4;
1816 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
1817 sk->prot->queue_xmit(sk, dev, buff, 1);
1818 tcp_statistics.TcpOutSegs++;
1819 }
1820
1821
1822 /*
1823 * FIXME:
1824 * This routine frees used buffers.
1825 * It should consider sending an ACK to let the
1826 * other end know we now have a bigger window.
1827 */
1828
1829 static void cleanup_rbuf(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1830 {
1831 unsigned long flags;
1832 unsigned long left;
1833 struct sk_buff *skb;
1834 unsigned long rspace;
1835
1836 if(sk->debug)
1837 printk("cleaning rbuf for sk=%p\n", sk);
1838
1839 save_flags(flags);
1840 cli();
1841
1842 left = sk->prot->rspace(sk);
1843
1844 /*
1845 * We have to loop through all the buffer headers,
1846 * and try to free up all the space we can.
1847 */
1848
1849 while((skb=skb_peek(&sk->receive_queue)) != NULL)
1850 {
1851 if (!skb->used || skb->users)
1852 break;
1853 skb_unlink(skb);
1854 skb->sk = sk;
1855 kfree_skb(skb, FREE_READ);
1856 }
1857
1858 restore_flags(flags);
1859
1860 /*
1861 * FIXME:
1862 * At this point we should send an ack if the difference
1863 * in the window, and the amount of space is bigger than
1864 * TCP_WINDOW_DIFF.
1865 */
1866
1867 if(sk->debug)
1868 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
1869 left);
1870 if ((rspace=sk->prot->rspace(sk)) != left)
1871 {
1872 /*
1873 * This area has caused the most trouble. The current strategy
1874 * is to simply do nothing if the other end has room to send at
1875 * least 3 full packets, because the ack from those will auto-
1876 * matically update the window. If the other end doesn't think
1877 * we have much space left, but we have room for at least 1 more
1878 * complete packet than it thinks we do, we will send an ack
1879 * immediately. Otherwise we will wait up to .5 seconds in case
1880 * the user reads some more.
1881 */
1882 sk->ack_backlog++;
1883 /*
1884 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
1885 * if the other end is offering a window smaller than the agreed on MSS
1886 * (called sk->mtu here). In theory there's no connection between send
1887 * and receive, and so no reason to think that they're going to send
1888 * small packets. For the moment I'm using the hack of reducing the mss
1889 * only on the send side, so I'm putting mtu here.
1890 */
1891
1892 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
1893 {
1894 /* Send an ack right now. */
1895 tcp_read_wakeup(sk);
1896 }
1897 else
1898 {
1899 /* Force it to send an ack soon. */
1900 int was_active = del_timer(&sk->retransmit_timer);
1901 if (!was_active || TCP_ACK_TIME < sk->timer.expires)
1902 {
1903 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
1904 }
1905 else
1906 add_timer(&sk->retransmit_timer);
1907 }
1908 }
1909 }
1910
1911
1912 /*
1913 * Handle reading urgent data. BSD has very simple semantics for
1914 * this, no blocking and very strange errors 8)
1915 */
1916
1917 static int tcp_read_urg(struct sock * sk, int nonblock,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1918 unsigned char *to, int len, unsigned flags)
1919 {
1920 /*
1921 * No URG data to read
1922 */
1923 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
1924 return -EINVAL; /* Yes this is right ! */
1925
1926 if (sk->err)
1927 {
1928 int tmp = -sk->err;
1929 sk->err = 0;
1930 return tmp;
1931 }
1932
1933 if (sk->state == TCP_CLOSE || sk->done)
1934 {
1935 if (!sk->done) {
1936 sk->done = 1;
1937 return 0;
1938 }
1939 return -ENOTCONN;
1940 }
1941
1942 if (sk->shutdown & RCV_SHUTDOWN)
1943 {
1944 sk->done = 1;
1945 return 0;
1946 }
1947 sk->inuse = 1;
1948 if (sk->urg_data & URG_VALID)
1949 {
1950 char c = sk->urg_data;
1951 if (!(flags & MSG_PEEK))
1952 sk->urg_data = URG_READ;
1953 put_fs_byte(c, to);
1954 release_sock(sk);
1955 return 1;
1956 }
1957 release_sock(sk);
1958
1959 /*
1960 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1961 * the available implementations agree in this case:
1962 * this call should never block, independent of the
1963 * blocking state of the socket.
1964 * Mike <pall@rz.uni-karlsruhe.de>
1965 */
1966 return -EAGAIN;
1967 }
1968
1969
1970 /*
1971 * This routine copies from a sock struct into the user buffer.
1972 */
1973
1974 static int tcp_read(struct sock *sk, unsigned char *to,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
1975 int len, int nonblock, unsigned flags)
1976 {
1977 struct wait_queue wait = { current, NULL };
1978 int copied = 0;
1979 u32 peek_seq;
1980 volatile u32 *seq; /* So gcc doesn't overoptimise */
1981 unsigned long used;
1982
1983 /*
1984 * This error should be checked.
1985 */
1986
1987 if (sk->state == TCP_LISTEN)
1988 return -ENOTCONN;
1989
1990 /*
1991 * Urgent data needs to be handled specially.
1992 */
1993
1994 if (flags & MSG_OOB)
1995 return tcp_read_urg(sk, nonblock, to, len, flags);
1996
1997 /*
1998 * Copying sequence to update. This is volatile to handle
1999 * the multi-reader case neatly (memcpy_to/fromfs might be
2000 * inline and thus not flush cached variables otherwise).
2001 */
2002
2003 peek_seq = sk->copied_seq;
2004 seq = &sk->copied_seq;
2005 if (flags & MSG_PEEK)
2006 seq = &peek_seq;
2007
2008 add_wait_queue(sk->sleep, &wait);
2009 sk->inuse = 1;
2010 while (len > 0)
2011 {
2012 struct sk_buff * skb;
2013 unsigned long offset;
2014
2015 /*
2016 * Are we at urgent data? Stop if we have read anything.
2017 */
2018
2019 if (copied && sk->urg_data && sk->urg_seq == *seq)
2020 break;
2021
2022 /*
2023 * Next get a buffer.
2024 */
2025
2026 current->state = TASK_INTERRUPTIBLE;
2027
2028 skb = skb_peek(&sk->receive_queue);
2029 do
2030 {
2031 if (!skb)
2032 break;
2033 if (before(*seq, skb->h.th->seq))
2034 break;
2035 offset = *seq - skb->h.th->seq;
2036 if (skb->h.th->syn)
2037 offset--;
2038 if (offset < skb->len)
2039 goto found_ok_skb;
2040 if (skb->h.th->fin)
2041 goto found_fin_ok;
2042 if (!(flags & MSG_PEEK))
2043 skb->used = 1;
2044 skb = skb->next;
2045 }
2046 while (skb != (struct sk_buff *)&sk->receive_queue);
2047
2048 if (copied)
2049 break;
2050
2051 if (sk->err)
2052 {
2053 copied = -sk->err;
2054 sk->err = 0;
2055 break;
2056 }
2057
2058 if (sk->state == TCP_CLOSE)
2059 {
2060 if (!sk->done)
2061 {
2062 sk->done = 1;
2063 break;
2064 }
2065 copied = -ENOTCONN;
2066 break;
2067 }
2068
2069 if (sk->shutdown & RCV_SHUTDOWN)
2070 {
2071 sk->done = 1;
2072 break;
2073 }
2074
2075 if (nonblock)
2076 {
2077 copied = -EAGAIN;
2078 break;
2079 }
2080
2081 cleanup_rbuf(sk);
2082 release_sock(sk);
2083 sk->socket->flags |= SO_WAITDATA;
2084 schedule();
2085 sk->socket->flags &= ~SO_WAITDATA;
2086 sk->inuse = 1;
2087
2088 if (current->signal & ~current->blocked)
2089 {
2090 copied = -ERESTARTSYS;
2091 break;
2092 }
2093 continue;
2094
2095 found_ok_skb:
2096 /*
2097 * Lock the buffer. We can be fairly relaxed as
2098 * an interrupt will never steal a buffer we are
2099 * using unless I've missed something serious in
2100 * tcp_data.
2101 */
2102
2103 skb->users++;
2104
2105 /*
2106 * Ok so how much can we use ?
2107 */
2108
2109 used = skb->len - offset;
2110 if (len < used)
2111 used = len;
2112 /*
2113 * Do we have urgent data here?
2114 */
2115
2116 if (sk->urg_data)
2117 {
2118 unsigned long urg_offset = sk->urg_seq - *seq;
2119 if (urg_offset < used)
2120 {
2121 if (!urg_offset)
2122 {
2123 if (!sk->urginline)
2124 {
2125 ++*seq;
2126 offset++;
2127 used--;
2128 }
2129 }
2130 else
2131 used = urg_offset;
2132 }
2133 }
2134
2135 /*
2136 * Copy it - We _MUST_ update *seq first so that we
2137 * don't ever double read when we have dual readers
2138 */
2139
2140 *seq += used;
2141
2142 /*
2143 * This memcpy_tofs can sleep. If it sleeps and we
2144 * do a second read it relies on the skb->users to avoid
2145 * a crash when cleanup_rbuf() gets called.
2146 */
2147
2148 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2149 skb->h.th->doff*4 + offset, used);
2150 copied += used;
2151 len -= used;
2152 to += used;
2153
2154 /*
2155 * We now will not sleep again until we are finished
2156 * with skb. Sorry if you are doing the SMP port
2157 * but you'll just have to fix it neatly ;)
2158 */
2159
2160 skb->users --;
2161
2162 if (after(sk->copied_seq,sk->urg_seq))
2163 sk->urg_data = 0;
2164 if (used + offset < skb->len)
2165 continue;
2166
2167 /*
2168 * Process the FIN.
2169 */
2170
2171 if (skb->h.th->fin)
2172 goto found_fin_ok;
2173 if (flags & MSG_PEEK)
2174 continue;
2175 skb->used = 1;
2176 continue;
2177
2178 found_fin_ok:
2179 ++*seq;
2180 if (flags & MSG_PEEK)
2181 break;
2182
2183 /*
2184 * All is done
2185 */
2186
2187 skb->used = 1;
2188 sk->shutdown |= RCV_SHUTDOWN;
2189 break;
2190
2191 }
2192 remove_wait_queue(sk->sleep, &wait);
2193 current->state = TASK_RUNNING;
2194
2195 /* Clean up data we have read: This will do ACK frames */
2196 cleanup_rbuf(sk);
2197 release_sock(sk);
2198 return copied;
2199 }
2200
2201 /*
2202 * State processing on a close. This implements the state shift for
2203 * sending our FIN frame. Note that we only send a FIN for some
2204 * states. A shutdown() may have already sent the FIN, or we may be
2205 * closed.
2206 */
2207
2208 static int tcp_close_state(struct sock *sk, int dead)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2209 {
2210 int ns=TCP_CLOSE;
2211 int send_fin=0;
2212 switch(sk->state)
2213 {
2214 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2215 break;
2216 case TCP_SYN_RECV:
2217 case TCP_ESTABLISHED: /* Closedown begin */
2218 ns=TCP_FIN_WAIT1;
2219 send_fin=1;
2220 break;
2221 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2222 case TCP_FIN_WAIT2:
2223 case TCP_CLOSING:
2224 ns=sk->state;
2225 break;
2226 case TCP_CLOSE:
2227 case TCP_LISTEN:
2228 break;
2229 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2230 wait only for the ACK */
2231 ns=TCP_LAST_ACK;
2232 send_fin=1;
2233 }
2234
2235 tcp_set_state(sk,ns);
2236
2237 /*
2238 * This is a (useful) BSD violating of the RFC. There is a
2239 * problem with TCP as specified in that the other end could
2240 * keep a socket open forever with no application left this end.
2241 * We use a 3 minute timeout (about the same as BSD) then kill
2242 * our end. If they send after that then tough - BUT: long enough
2243 * that we won't make the old 4*rto = almost no time - whoops
2244 * reset mistake.
2245 */
2246 if(dead && ns==TCP_FIN_WAIT2)
2247 {
2248 int timer_active=del_timer(&sk->timer);
2249 if(timer_active)
2250 add_timer(&sk->timer);
2251 else
2252 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2253 }
2254
2255 return send_fin;
2256 }
2257
2258 /*
2259 * Send a fin.
2260 */
2261
2262 static void tcp_send_fin(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2263 {
2264 struct proto *prot =(struct proto *)sk->prot;
2265 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2266 struct tcphdr *t1;
2267 struct sk_buff *buff;
2268 struct device *dev=NULL;
2269 int tmp;
2270
2271 release_sock(sk); /* in case the malloc sleeps. */
2272
2273 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2274 sk->inuse = 1;
2275
2276 if (buff == NULL)
2277 {
2278 /* This is a disaster if it occurs */
2279 printk("tcp_send_fin: Impossible malloc failure");
2280 return;
2281 }
2282
2283 /*
2284 * Administrivia
2285 */
2286
2287 buff->sk = sk;
2288 buff->len = sizeof(*t1);
2289 buff->localroute = sk->localroute;
2290 t1 =(struct tcphdr *) buff->data;
2291
2292 /*
2293 * Put in the IP header and routing stuff.
2294 */
2295
2296 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2297 IPPROTO_TCP, sk->opt,
2298 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2299 if (tmp < 0)
2300 {
2301 int t;
2302 /*
2303 * Finish anyway, treat this as a send that got lost.
2304 * (Not good).
2305 */
2306
2307 buff->free = 1;
2308 prot->wfree(sk,buff->mem_addr, buff->mem_len);
2309 sk->write_seq++;
2310 t=del_timer(&sk->timer);
2311 if(t)
2312 add_timer(&sk->timer);
2313 else
2314 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2315 return;
2316 }
2317
2318 /*
2319 * We ought to check if the end of the queue is a buffer and
2320 * if so simply add the fin to that buffer, not send it ahead.
2321 */
2322
2323 t1 =(struct tcphdr *)((char *)t1 +tmp);
2324 buff->len += tmp;
2325 buff->dev = dev;
2326 memcpy(t1, th, sizeof(*t1));
2327 t1->seq = ntohl(sk->write_seq);
2328 sk->write_seq++;
2329 buff->h.seq = sk->write_seq;
2330 t1->ack = 1;
2331 t1->ack_seq = ntohl(sk->acked_seq);
2332 t1->window = ntohs(sk->window=tcp_select_window(sk));
2333 t1->fin = 1;
2334 t1->rst = 0;
2335 t1->doff = sizeof(*t1)/4;
2336 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2337
2338 /*
2339 * If there is data in the write queue, the fin must be appended to
2340 * the write queue.
2341 */
2342
2343 if (skb_peek(&sk->write_queue) != NULL)
2344 {
2345 buff->free = 0;
2346 if (buff->next != NULL)
2347 {
2348 printk("tcp_send_fin: next != NULL\n");
2349 skb_unlink(buff);
2350 }
2351 skb_queue_tail(&sk->write_queue, buff);
2352 }
2353 else
2354 {
2355 sk->sent_seq = sk->write_seq;
2356 sk->prot->queue_xmit(sk, dev, buff, 0);
2357 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2358 }
2359 }
2360
2361 /*
2362 * Shutdown the sending side of a connection. Much like close except
2363 * that we don't receive shut down or set sk->dead=1.
2364 */
2365
2366 void tcp_shutdown(struct sock *sk, int how)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2367 {
2368 /*
2369 * We need to grab some memory, and put together a FIN,
2370 * and then put it into the queue to be sent.
2371 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2372 */
2373
2374 if (!(how & SEND_SHUTDOWN))
2375 return;
2376
2377 /*
2378 * If we've already sent a FIN, or it's a closed state
2379 */
2380
2381 if (sk->state == TCP_FIN_WAIT1 ||
2382 sk->state == TCP_FIN_WAIT2 ||
2383 sk->state == TCP_CLOSING ||
2384 sk->state == TCP_LAST_ACK ||
2385 sk->state == TCP_TIME_WAIT ||
2386 sk->state == TCP_CLOSE ||
2387 sk->state == TCP_LISTEN
2388 )
2389 {
2390 return;
2391 }
2392 sk->inuse = 1;
2393
2394 /*
2395 * flag that the sender has shutdown
2396 */
2397
2398 sk->shutdown |= SEND_SHUTDOWN;
2399
2400 /*
2401 * Clear out any half completed packets.
2402 */
2403
2404 if (sk->partial)
2405 tcp_send_partial(sk);
2406
2407 /*
2408 * FIN if needed
2409 */
2410
2411 if(tcp_close_state(sk,0))
2412 tcp_send_fin(sk);
2413
2414 release_sock(sk);
2415 }
2416
2417
2418 static int
2419 tcp_recvfrom(struct sock *sk, unsigned char *to,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2420 int to_len, int nonblock, unsigned flags,
2421 struct sockaddr_in *addr, int *addr_len)
2422 {
2423 int result;
2424
2425 /*
2426 * Have to check these first unlike the old code. If
2427 * we check them after we lose data on an error
2428 * which is wrong
2429 */
2430
2431 if(addr_len)
2432 *addr_len = sizeof(*addr);
2433 result=tcp_read(sk, to, to_len, nonblock, flags);
2434
2435 if (result < 0)
2436 return(result);
2437
2438 if(addr)
2439 {
2440 addr->sin_family = AF_INET;
2441 addr->sin_port = sk->dummy_th.dest;
2442 addr->sin_addr.s_addr = sk->daddr;
2443 }
2444 return(result);
2445 }
2446
2447
2448 /*
2449 * This routine will send an RST to the other tcp.
2450 */
2451
2452 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2453 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2454 {
2455 struct sk_buff *buff;
2456 struct tcphdr *t1;
2457 int tmp;
2458 struct device *ndev=NULL;
2459
2460 /*
2461 * Cannot reset a reset (Think about it).
2462 */
2463
2464 if(th->rst)
2465 return;
2466
2467 /*
2468 * We need to grab some memory, and put together an RST,
2469 * and then put it into the queue to be sent.
2470 */
2471
2472 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2473 if (buff == NULL)
2474 return;
2475
2476 buff->len = sizeof(*t1);
2477 buff->sk = NULL;
2478 buff->dev = dev;
2479 buff->localroute = 0;
2480
2481 t1 =(struct tcphdr *) buff->data;
2482
2483 /*
2484 * Put in the IP header and routing stuff.
2485 */
2486
2487 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2488 sizeof(struct tcphdr),tos,ttl);
2489 if (tmp < 0)
2490 {
2491 buff->free = 1;
2492 prot->wfree(NULL, buff->mem_addr, buff->mem_len);
2493 return;
2494 }
2495
2496 t1 =(struct tcphdr *)((char *)t1 +tmp);
2497 buff->len += tmp;
2498 memcpy(t1, th, sizeof(*t1));
2499
2500 /*
2501 * Swap the send and the receive.
2502 */
2503
2504 t1->dest = th->source;
2505 t1->source = th->dest;
2506 t1->rst = 1;
2507 t1->window = 0;
2508
2509 if(th->ack)
2510 {
2511 t1->ack = 0;
2512 t1->seq = th->ack_seq;
2513 t1->ack_seq = 0;
2514 }
2515 else
2516 {
2517 t1->ack = 1;
2518 if(!th->syn)
2519 t1->ack_seq=htonl(th->seq);
2520 else
2521 t1->ack_seq=htonl(th->seq+1);
2522 t1->seq=0;
2523 }
2524
2525 t1->syn = 0;
2526 t1->urg = 0;
2527 t1->fin = 0;
2528 t1->psh = 0;
2529 t1->doff = sizeof(*t1)/4;
2530 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2531 prot->queue_xmit(NULL, ndev, buff, 1);
2532 tcp_statistics.TcpOutSegs++;
2533 }
2534
2535
2536 /*
2537 * Look for tcp options. Parses everything but only knows about MSS.
2538 * This routine is always called with the packet containing the SYN.
2539 * However it may also be called with the ack to the SYN. So you
2540 * can't assume this is always the SYN. It's always called after
2541 * we have set up sk->mtu to our own MTU.
2542 *
2543 * We need at minimum to add PAWS support here. Possibly large windows
2544 * as Linux gets deployed on 100Mb/sec networks.
2545 */
2546
2547 static void tcp_options(struct sock *sk, struct tcphdr *th)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2548 {
2549 unsigned char *ptr;
2550 int length=(th->doff*4)-sizeof(struct tcphdr);
2551 int mss_seen = 0;
2552
2553 ptr = (unsigned char *)(th + 1);
2554
2555 while(length>0)
2556 {
2557 int opcode=*ptr++;
2558 int opsize=*ptr++;
2559 switch(opcode)
2560 {
2561 case TCPOPT_EOL:
2562 return;
2563 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2564 length--;
2565 ptr--; /* the opsize=*ptr++ above was a mistake */
2566 continue;
2567
2568 default:
2569 if(opsize<=2) /* Avoid silly options looping forever */
2570 return;
2571 switch(opcode)
2572 {
2573 case TCPOPT_MSS:
2574 if(opsize==4 && th->syn)
2575 {
2576 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2577 mss_seen = 1;
2578 }
2579 break;
2580 /* Add other options here as people feel the urge to implement stuff like large windows */
2581 }
2582 ptr+=opsize-2;
2583 length-=opsize;
2584 }
2585 }
2586 if (th->syn)
2587 {
2588 if (! mss_seen)
2589 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2590 }
2591 #ifdef CONFIG_INET_PCTCP
2592 sk->mss = min(sk->max_window >> 1, sk->mtu);
2593 #else
2594 sk->mss = min(sk->max_window, sk->mtu);
2595 #endif
2596 }
2597
2598 static inline unsigned long default_mask(unsigned long dst)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2599 {
2600 dst = ntohl(dst);
2601 if (IN_CLASSA(dst))
2602 return htonl(IN_CLASSA_NET);
2603 if (IN_CLASSB(dst))
2604 return htonl(IN_CLASSB_NET);
2605 return htonl(IN_CLASSC_NET);
2606 }
2607
2608 /*
2609 * Default sequence number picking algorithm.
2610 * As close as possible to RFC 793, which
2611 * suggests using a 250kHz clock.
2612 * Further reading shows this assumes 2MB/s networks.
2613 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2614 * That's funny, Linux has one built in! Use it!
2615 */
2616
2617 extern inline u32 tcp_init_seq(void)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2618 {
2619 struct timeval tv;
2620 do_gettimeofday(&tv);
2621 return tv.tv_usec+tv.tv_sec*1000000;
2622 }
2623
2624 /*
2625 * This routine handles a connection request.
2626 * It should make sure we haven't already responded.
2627 * Because of the way BSD works, we have to send a syn/ack now.
2628 * This also means it will be harder to close a socket which is
2629 * listening.
2630 */
2631
2632 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2633 unsigned long daddr, unsigned long saddr,
2634 struct options *opt, struct device *dev, u32 seq)
2635 {
2636 struct sk_buff *buff;
2637 struct tcphdr *t1;
2638 unsigned char *ptr;
2639 struct sock *newsk;
2640 struct tcphdr *th;
2641 struct device *ndev=NULL;
2642 int tmp;
2643 struct rtable *rt;
2644
2645 th = skb->h.th;
2646
2647 /* If the socket is dead, don't accept the connection. */
2648 if (!sk->dead)
2649 {
2650 sk->data_ready(sk,0);
2651 }
2652 else
2653 {
2654 if(sk->debug)
2655 printk("Reset on %p: Connect on dead socket.\n",sk);
2656 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2657 tcp_statistics.TcpAttemptFails++;
2658 kfree_skb(skb, FREE_READ);
2659 return;
2660 }
2661
2662 /*
2663 * Make sure we can accept more. This will prevent a
2664 * flurry of syns from eating up all our memory.
2665 */
2666
2667 if (sk->ack_backlog >= sk->max_ack_backlog)
2668 {
2669 tcp_statistics.TcpAttemptFails++;
2670 kfree_skb(skb, FREE_READ);
2671 return;
2672 }
2673
2674 /*
2675 * We need to build a new sock struct.
2676 * It is sort of bad to have a socket without an inode attached
2677 * to it, but the wake_up's will just wake up the listening socket,
2678 * and if the listening socket is destroyed before this is taken
2679 * off of the queue, this will take care of it.
2680 */
2681
2682 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2683 if (newsk == NULL)
2684 {
2685 /* just ignore the syn. It will get retransmitted. */
2686 tcp_statistics.TcpAttemptFails++;
2687 kfree_skb(skb, FREE_READ);
2688 return;
2689 }
2690
2691 memcpy(newsk, sk, sizeof(*newsk));
2692 skb_queue_head_init(&newsk->write_queue);
2693 skb_queue_head_init(&newsk->receive_queue);
2694 newsk->send_head = NULL;
2695 newsk->send_tail = NULL;
2696 skb_queue_head_init(&newsk->back_log);
2697 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2698 newsk->rto = TCP_TIMEOUT_INIT;
2699 newsk->mdev = 0;
2700 newsk->max_window = 0;
2701 newsk->cong_window = 1;
2702 newsk->cong_count = 0;
2703 newsk->ssthresh = 0;
2704 newsk->backoff = 0;
2705 newsk->blog = 0;
2706 newsk->intr = 0;
2707 newsk->proc = 0;
2708 newsk->done = 0;
2709 newsk->partial = NULL;
2710 newsk->pair = NULL;
2711 newsk->wmem_alloc = 0;
2712 newsk->rmem_alloc = 0;
2713 newsk->localroute = sk->localroute;
2714
2715 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2716
2717 newsk->err = 0;
2718 newsk->shutdown = 0;
2719 newsk->ack_backlog = 0;
2720 newsk->acked_seq = skb->h.th->seq+1;
2721 newsk->copied_seq = skb->h.th->seq+1;
2722 newsk->fin_seq = skb->h.th->seq;
2723 newsk->state = TCP_SYN_RECV;
2724 newsk->timeout = 0;
2725 newsk->ip_xmit_timeout = 0;
2726 newsk->write_seq = seq;
2727 newsk->window_seq = newsk->write_seq;
2728 newsk->rcv_ack_seq = newsk->write_seq;
2729 newsk->urg_data = 0;
2730 newsk->retransmits = 0;
2731 newsk->linger=0;
2732 newsk->destroy = 0;
2733 init_timer(&newsk->timer);
2734 newsk->timer.data = (unsigned long)newsk;
2735 newsk->timer.function = &net_timer;
2736 init_timer(&newsk->retransmit_timer);
2737 newsk->retransmit_timer.data = (unsigned long)newsk;
2738 newsk->retransmit_timer.function=&retransmit_timer;
2739 newsk->dummy_th.source = skb->h.th->dest;
2740 newsk->dummy_th.dest = skb->h.th->source;
2741
2742 /*
2743 * Swap these two, they are from our point of view.
2744 */
2745
2746 newsk->daddr = saddr;
2747 newsk->saddr = daddr;
2748
2749 put_sock(newsk->num,newsk);
2750 newsk->dummy_th.res1 = 0;
2751 newsk->dummy_th.doff = 6;
2752 newsk->dummy_th.fin = 0;
2753 newsk->dummy_th.syn = 0;
2754 newsk->dummy_th.rst = 0;
2755 newsk->dummy_th.psh = 0;
2756 newsk->dummy_th.ack = 0;
2757 newsk->dummy_th.urg = 0;
2758 newsk->dummy_th.res2 = 0;
2759 newsk->acked_seq = skb->h.th->seq + 1;
2760 newsk->copied_seq = skb->h.th->seq + 1;
2761 newsk->socket = NULL;
2762
2763 /*
2764 * Grab the ttl and tos values and use them
2765 */
2766
2767 newsk->ip_ttl=sk->ip_ttl;
2768 newsk->ip_tos=skb->ip_hdr->tos;
2769
2770 /*
2771 * Use 512 or whatever user asked for
2772 */
2773
2774 /*
2775 * Note use of sk->user_mss, since user has no direct access to newsk
2776 */
2777
2778 rt=ip_rt_route(saddr, NULL,NULL);
2779
2780 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
2781 newsk->window_clamp = rt->rt_window;
2782 else
2783 newsk->window_clamp = 0;
2784
2785 if (sk->user_mss)
2786 newsk->mtu = sk->user_mss;
2787 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
2788 newsk->mtu = rt->rt_mss - HEADER_SIZE;
2789 else
2790 {
2791 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
2792 if ((saddr ^ daddr) & default_mask(saddr))
2793 #else
2794 if ((saddr ^ daddr) & dev->pa_mask)
2795 #endif
2796 newsk->mtu = 576 - HEADER_SIZE;
2797 else
2798 newsk->mtu = MAX_WINDOW;
2799 }
2800
2801 /*
2802 * But not bigger than device MTU
2803 */
2804
2805 newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
2806
2807 /*
2808 * This will min with what arrived in the packet
2809 */
2810
2811 tcp_options(newsk,skb->h.th);
2812
2813 tcp_cache_zap();
2814
2815 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
2816 if (buff == NULL)
2817 {
2818 sk->err = ENOMEM;
2819 newsk->dead = 1;
2820 newsk->state = TCP_CLOSE;
2821 /* And this will destroy it */
2822 release_sock(newsk);
2823 kfree_skb(skb, FREE_READ);
2824 tcp_statistics.TcpAttemptFails++;
2825 return;
2826 }
2827
2828 buff->len = sizeof(struct tcphdr)+4;
2829 buff->sk = newsk;
2830 buff->localroute = newsk->localroute;
2831
2832 t1 =(struct tcphdr *) buff->data;
2833
2834 /*
2835 * Put in the IP header and routing stuff.
2836 */
2837
2838 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
2839 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
2840
2841 /*
2842 * Something went wrong.
2843 */
2844
2845 if (tmp < 0)
2846 {
2847 sk->err = tmp;
2848 buff->free = 1;
2849 kfree_skb(buff,FREE_WRITE);
2850 newsk->dead = 1;
2851 newsk->state = TCP_CLOSE;
2852 release_sock(newsk);
2853 skb->sk = sk;
2854 kfree_skb(skb, FREE_READ);
2855 tcp_statistics.TcpAttemptFails++;
2856 return;
2857 }
2858
2859 buff->len += tmp;
2860 t1 =(struct tcphdr *)((char *)t1 +tmp);
2861
2862 memcpy(t1, skb->h.th, sizeof(*t1));
2863 buff->h.seq = newsk->write_seq;
2864 /*
2865 * Swap the send and the receive.
2866 */
2867 t1->dest = skb->h.th->source;
2868 t1->source = newsk->dummy_th.source;
2869 t1->seq = ntohl(newsk->write_seq++);
2870 t1->ack = 1;
2871 newsk->window = tcp_select_window(newsk);
2872 newsk->sent_seq = newsk->write_seq;
2873 t1->window = ntohs(newsk->window);
2874 t1->res1 = 0;
2875 t1->res2 = 0;
2876 t1->rst = 0;
2877 t1->urg = 0;
2878 t1->psh = 0;
2879 t1->syn = 1;
2880 t1->ack_seq = ntohl(skb->h.th->seq+1);
2881 t1->doff = sizeof(*t1)/4+1;
2882 ptr =(unsigned char *)(t1+1);
2883 ptr[0] = 2;
2884 ptr[1] = 4;
2885 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
2886 ptr[3] =(newsk->mtu) & 0xff;
2887
2888 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
2889 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
2890 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
2891 skb->sk = newsk;
2892
2893 /*
2894 * Charge the sock_buff to newsk.
2895 */
2896
2897 sk->rmem_alloc -= skb->mem_len;
2898 newsk->rmem_alloc += skb->mem_len;
2899
2900 skb_queue_tail(&sk->receive_queue,skb);
2901 sk->ack_backlog++;
2902 release_sock(newsk);
2903 tcp_statistics.TcpOutSegs++;
2904 }
2905
2906
2907 static void tcp_close(struct sock *sk, int timeout)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2908 {
2909 /*
2910 * We need to grab some memory, and put together a FIN,
2911 * and then put it into the queue to be sent.
2912 */
2913
2914 sk->inuse = 1;
2915
2916 if(th_cache_sk==sk)
2917 tcp_cache_zap();
2918 if(sk->state == TCP_LISTEN)
2919 {
2920 /* Special case */
2921 tcp_set_state(sk, TCP_CLOSE);
2922 tcp_close_pending(sk);
2923 release_sock(sk);
2924 return;
2925 }
2926
2927 sk->keepopen = 1;
2928 sk->shutdown = SHUTDOWN_MASK;
2929
2930 if (!sk->dead)
2931 sk->state_change(sk);
2932
2933 if (timeout == 0)
2934 {
2935 struct sk_buff *skb;
2936
2937 /*
2938 * We need to flush the recv. buffs. We do this only on the
2939 * descriptor close, not protocol-sourced closes, because the
2940 * reader process may not have drained the data yet!
2941 */
2942
2943 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
2944 kfree_skb(skb, FREE_READ);
2945 /*
2946 * Get rid off any half-completed packets.
2947 */
2948
2949 if (sk->partial)
2950 tcp_send_partial(sk);
2951 }
2952
2953
2954 /*
2955 * Timeout is not the same thing - however the code likes
2956 * to send both the same way (sigh).
2957 */
2958
2959 if(timeout)
2960 {
2961 tcp_set_state(sk, TCP_CLOSE); /* Dead */
2962 }
2963 else
2964 {
2965 if(tcp_close_state(sk,1)==1)
2966 {
2967 tcp_send_fin(sk);
2968 }
2969 }
2970 release_sock(sk);
2971 }
2972
2973
2974 /*
2975 * This routine takes stuff off of the write queue,
2976 * and puts it in the xmit queue. This happens as incoming acks
2977 * open up the remote window for us.
2978 */
2979
2980 static void tcp_write_xmit(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
2981 {
2982 struct sk_buff *skb;
2983
2984 /*
2985 * The bytes will have to remain here. In time closedown will
2986 * empty the write queue and all will be happy
2987 */
2988
2989 if(sk->zapped)
2990 return;
2991
2992 /*
2993 * Anything on the transmit queue that fits the window can
2994 * be added providing we are not
2995 *
2996 * a) retransmitting (Nagle's rule)
2997 * b) exceeding our congestion window.
2998 */
2999
3000 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3001 before(skb->h.seq, sk->window_seq + 1) &&
3002 (sk->retransmits == 0 ||
3003 sk->ip_xmit_timeout != TIME_WRITE ||
3004 before(skb->h.seq, sk->rcv_ack_seq + 1))
3005 && sk->packets_out < sk->cong_window)
3006 {
3007 IS_SKB(skb);
3008 skb_unlink(skb);
3009
3010 /*
3011 * See if we really need to send the packet.
3012 */
3013
3014 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3015 {
3016 /*
3017 * This is acked data. We can discard it. This
3018 * cannot currently occur.
3019 */
3020
3021 sk->retransmits = 0;
3022 kfree_skb(skb, FREE_WRITE);
3023 if (!sk->dead)
3024 sk->write_space(sk);
3025 }
3026 else
3027 {
3028 struct tcphdr *th;
3029 struct iphdr *iph;
3030 int size;
3031 /*
3032 * put in the ack seq and window at this point rather than earlier,
3033 * in order to keep them monotonic. We really want to avoid taking
3034 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3035 * Ack and window will in general have changed since this packet was put
3036 * on the write queue.
3037 */
3038 iph = (struct iphdr *)(skb->data +
3039 skb->dev->hard_header_len);
3040 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3041 size = skb->len - (((unsigned char *) th) - skb->data);
3042
3043 th->ack_seq = ntohl(sk->acked_seq);
3044 th->window = ntohs(tcp_select_window(sk));
3045
3046 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3047
3048 sk->sent_seq = skb->h.seq;
3049
3050 /*
3051 * IP manages our queue for some crazy reason
3052 */
3053
3054 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3055
3056 /*
3057 * Again we slide the timer wrongly
3058 */
3059
3060 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3061 }
3062 }
3063 }
3064
3065
3066 /*
3067 * This routine deals with incoming acks, but not outgoing ones.
3068 */
3069
3070 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
3071 {
3072 u32 ack;
3073 int flag = 0;
3074
3075 /*
3076 * 1 - there was data in packet as well as ack or new data is sent or
3077 * in shutdown state
3078 * 2 - data from retransmit queue was acked and removed
3079 * 4 - window shrunk or data from retransmit queue was acked and removed
3080 */
3081
3082 if(sk->zapped)
3083 return(1); /* Dead, cant ack any more so why bother */
3084
3085 /*
3086 * Have we discovered a larger window
3087 */
3088
3089 ack = ntohl(th->ack_seq);
3090
3091 if (ntohs(th->window) > sk->max_window)
3092 {
3093 sk->max_window = ntohs(th->window);
3094 #ifdef CONFIG_INET_PCTCP
3095 /* Hack because we don't send partial packets to non SWS
3096 handling hosts */
3097 sk->mss = min(sk->max_window>>1, sk->mtu);
3098 #else
3099 sk->mss = min(sk->max_window, sk->mtu);
3100 #endif
3101 }
3102
3103 /*
3104 * We have dropped back to keepalive timeouts. Thus we have
3105 * no retransmits pending.
3106 */
3107
3108 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3109 sk->retransmits = 0;
3110
3111 /*
3112 * If the ack is newer than sent or older than previous acks
3113 * then we can probably ignore it.
3114 */
3115
3116 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3117 {
3118 if(sk->debug)
3119 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3120
3121 /*
3122 * Keepalive processing.
3123 */
3124
3125 if (after(ack, sk->sent_seq))
3126 {
3127 return(0);
3128 }
3129
3130 /*
3131 * Restart the keepalive timer.
3132 */
3133
3134 if (sk->keepopen)
3135 {
3136 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3137 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3138 }
3139 return(1);
3140 }
3141
3142 /*
3143 * If there is data set flag 1
3144 */
3145
3146 if (len != th->doff*4)
3147 flag |= 1;
3148
3149 /*
3150 * See if our window has been shrunk.
3151 */
3152
3153 if (after(sk->window_seq, ack+ntohs(th->window)))
3154 {
3155 /*
3156 * We may need to move packets from the send queue
3157 * to the write queue, if the window has been shrunk on us.
3158 * The RFC says you are not allowed to shrink your window
3159 * like this, but if the other end does, you must be able
3160 * to deal with it.
3161 */
3162 struct sk_buff *skb;
3163 struct sk_buff *skb2;
3164 struct sk_buff *wskb = NULL;
3165
3166 skb2 = sk->send_head;
3167 sk->send_head = NULL;
3168 sk->send_tail = NULL;
3169
3170 /*
3171 * This is an artifact of a flawed concept. We want one
3172 * queue and a smarter send routine when we send all.
3173 */
3174
3175 flag |= 4; /* Window changed */
3176
3177 sk->window_seq = ack + ntohs(th->window);
3178 cli();
3179 while (skb2 != NULL)
3180 {
3181 skb = skb2;
3182 skb2 = skb->link3;
3183 skb->link3 = NULL;
3184 if (after(skb->h.seq, sk->window_seq))
3185 {
3186 if (sk->packets_out > 0)
3187 sk->packets_out--;
3188 /* We may need to remove this from the dev send list. */
3189 if (skb->next != NULL)
3190 {
3191 skb_unlink(skb);
3192 }
3193 /* Now add it to the write_queue. */
3194 if (wskb == NULL)
3195 skb_queue_head(&sk->write_queue,skb);
3196 else
3197 skb_append(wskb,skb);
3198 wskb = skb;
3199 }
3200 else
3201 {
3202 if (sk->send_head == NULL)
3203 {
3204 sk->send_head = skb;
3205 sk->send_tail = skb;
3206 }
3207 else
3208 {
3209 sk->send_tail->link3 = skb;
3210 sk->send_tail = skb;
3211 }
3212 skb->link3 = NULL;
3213 }
3214 }
3215 sti();
3216 }
3217
3218 /*
3219 * Pipe has emptied
3220 */
3221
3222 if (sk->send_tail == NULL || sk->send_head == NULL)
3223 {
3224 sk->send_head = NULL;
3225 sk->send_tail = NULL;
3226 sk->packets_out= 0;
3227 }
3228
3229 /*
3230 * Update the right hand window edge of the host
3231 */
3232
3233 sk->window_seq = ack + ntohs(th->window);
3234
3235 /*
3236 * We don't want too many packets out there.
3237 */
3238
3239 if (sk->ip_xmit_timeout == TIME_WRITE &&
3240 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3241 {
3242 /*
3243 * This is Jacobson's slow start and congestion avoidance.
3244 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3245 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3246 * counter and increment it once every cwnd times. It's possible
3247 * that this should be done only if sk->retransmits == 0. I'm
3248 * interpreting "new data is acked" as including data that has
3249 * been retransmitted but is just now being acked.
3250 */
3251 if (sk->cong_window < sk->ssthresh)
3252 /*
3253 * In "safe" area, increase
3254 */
3255 sk->cong_window++;
3256 else
3257 {
3258 /*
3259 * In dangerous area, increase slowly. In theory this is
3260 * sk->cong_window += 1 / sk->cong_window
3261 */
3262 if (sk->cong_count >= sk->cong_window)
3263 {
3264 sk->cong_window++;
3265 sk->cong_count = 0;
3266 }
3267 else
3268 sk->cong_count++;
3269 }
3270 }
3271
3272 /*
3273 * Remember the highest ack received.
3274 */
3275
3276 sk->rcv_ack_seq = ack;
3277
3278 /*
3279 * If this ack opens up a zero window, clear backoff. It was
3280 * being used to time the probes, and is probably far higher than
3281 * it needs to be for normal retransmission.
3282 */
3283
3284 if (sk->ip_xmit_timeout == TIME_PROBE0)
3285 {
3286 sk->retransmits = 0; /* Our probe was answered */
3287
3288 /*
3289 * Was it a usable window open ?
3290 */
3291
3292 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3293 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3294 {
3295 sk->backoff = 0;
3296
3297 /*
3298 * Recompute rto from rtt. this eliminates any backoff.
3299 */
3300
3301 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3302 if (sk->rto > 120*HZ)
3303 sk->rto = 120*HZ;
3304 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3305 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3306 .2 of a second is going to need huge windows (SIGH) */
3307 sk->rto = 20;
3308 }
3309 }
3310
3311 /*
3312 * See if we can take anything off of the retransmit queue.
3313 */
3314
3315 while(sk->send_head != NULL)
3316 {
3317 /* Check for a bug. */
3318 if (sk->send_head->link3 &&
3319 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3320 printk("INET: tcp.c: *** bug send_list out of order.\n");
3321
3322 /*
3323 * If our packet is before the ack sequence we can
3324 * discard it as it's confirmed to have arrived the other end.
3325 */
3326
3327 if (before(sk->send_head->h.seq, ack+1))
3328 {
3329 struct sk_buff *oskb;
3330 if (sk->retransmits)
3331 {
3332 /*
3333 * We were retransmitting. don't count this in RTT est
3334 */
3335 flag |= 2;
3336
3337 /*
3338 * even though we've gotten an ack, we're still
3339 * retransmitting as long as we're sending from
3340 * the retransmit queue. Keeping retransmits non-zero
3341 * prevents us from getting new data interspersed with
3342 * retransmissions.
3343 */
3344
3345 if (sk->send_head->link3) /* Any more queued retransmits? */
3346 sk->retransmits = 1;
3347 else
3348 sk->retransmits = 0;
3349 }
3350 /*
3351 * Note that we only reset backoff and rto in the
3352 * rtt recomputation code. And that doesn't happen
3353 * if there were retransmissions in effect. So the
3354 * first new packet after the retransmissions is
3355 * sent with the backoff still in effect. Not until
3356 * we get an ack from a non-retransmitted packet do
3357 * we reset the backoff and rto. This allows us to deal
3358 * with a situation where the network delay has increased
3359 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3360 */
3361
3362 /*
3363 * We have one less packet out there.
3364 */
3365
3366 if (sk->packets_out > 0)
3367 sk->packets_out --;
3368 /*
3369 * Wake up the process, it can probably write more.
3370 */
3371 if (!sk->dead)
3372 sk->write_space(sk);
3373 oskb = sk->send_head;
3374
3375 if (!(flag&2)) /* Not retransmitting */
3376 {
3377 long m;
3378
3379 /*
3380 * The following amusing code comes from Jacobson's
3381 * article in SIGCOMM '88. Note that rtt and mdev
3382 * are scaled versions of rtt and mean deviation.
3383 * This is designed to be as fast as possible
3384 * m stands for "measurement".
3385 */
3386
3387 m = jiffies - oskb->when; /* RTT */
3388 if(m<=0)
3389 m=1; /* IS THIS RIGHT FOR <0 ??? */
3390 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3391 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3392 if (m < 0)
3393 m = -m; /* m is now abs(error) */
3394 m -= (sk->mdev >> 2); /* similar update on mdev */
3395 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3396
3397 /*
3398 * Now update timeout. Note that this removes any backoff.
3399 */
3400
3401 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3402 if (sk->rto > 120*HZ)
3403 sk->rto = 120*HZ;
3404 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3405 sk->rto = 20;
3406 sk->backoff = 0;
3407 }
3408 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
3409 In this case as we just set it up */
3410 cli();
3411 oskb = sk->send_head;
3412 IS_SKB(oskb);
3413 sk->send_head = oskb->link3;
3414 if (sk->send_head == NULL)
3415 {
3416 sk->send_tail = NULL;
3417 }
3418
3419 /*
3420 * We may need to remove this from the dev send list.
3421 */
3422
3423 if (oskb->next)
3424 skb_unlink(oskb);
3425 sti();
3426 kfree_skb(oskb, FREE_WRITE); /* write. */
3427 if (!sk->dead)
3428 sk->write_space(sk);
3429 }
3430 else
3431 {
3432 break;
3433 }
3434 }
3435
3436 /*
3437 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3438 * returns non-NULL, we complete ignore the timer stuff in the else
3439 * clause. We ought to organize the code so that else clause can
3440 * (should) be executed regardless, possibly moving the PROBE timer
3441 * reset over. The skb_peek() thing should only move stuff to the
3442 * write queue, NOT also manage the timer functions.
3443 */
3444
3445 /*
3446 * Maybe we can take some stuff off of the write queue,
3447 * and put it onto the xmit queue.
3448 */
3449 if (skb_peek(&sk->write_queue) != NULL)
3450 {
3451 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3452 (sk->retransmits == 0 ||
3453 sk->ip_xmit_timeout != TIME_WRITE ||
3454 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3455 && sk->packets_out < sk->cong_window)
3456 {
3457 /*
3458 * Add more data to the send queue.
3459 */
3460 flag |= 1;
3461 tcp_write_xmit(sk);
3462 }
3463 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3464 sk->send_head == NULL &&
3465 sk->ack_backlog == 0 &&
3466 sk->state != TCP_TIME_WAIT)
3467 {
3468 /*
3469 * Data to queue but no room.
3470 */
3471 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3472 }
3473 }
3474 else
3475 {
3476 /*
3477 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3478 * from TCP_CLOSE we don't do anything
3479 *
3480 * from anything else, if there is write data (or fin) pending,
3481 * we use a TIME_WRITE timeout, else if keepalive we reset to
3482 * a KEEPALIVE timeout, else we delete the timer.
3483 *
3484 * We do not set flag for nominal write data, otherwise we may
3485 * force a state where we start to write itsy bitsy tidbits
3486 * of data.
3487 */
3488
3489 switch(sk->state) {
3490 case TCP_TIME_WAIT:
3491 /*
3492 * keep us in TIME_WAIT until we stop getting packets,
3493 * reset the timeout.
3494 */
3495 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3496 break;
3497 case TCP_CLOSE:
3498 /*
3499 * don't touch the timer.
3500 */
3501 break;
3502 default:
3503 /*
3504 * Must check send_head, write_queue, and ack_backlog
3505 * to determine which timeout to use.
3506 */
3507 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3508 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3509 } else if (sk->keepopen) {
3510 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3511 } else {
3512 del_timer(&sk->retransmit_timer);
3513 sk->ip_xmit_timeout = 0;
3514 }
3515 break;
3516 }
3517 }
3518
3519 /*
3520 * We have nothing queued but space to send. Send any partial
3521 * packets immediately (end of Nagle rule application).
3522 */
3523
3524 if (sk->packets_out == 0 && sk->partial != NULL &&
3525 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3526 {
3527 flag |= 1;
3528 tcp_send_partial(sk);
3529 }
3530
3531 /*
3532 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3533 * we are now waiting for an acknowledge to our FIN. The other end is
3534 * already in TIME_WAIT.
3535 *
3536 * Move to TCP_CLOSE on success.
3537 */
3538
3539 if (sk->state == TCP_LAST_ACK)
3540 {
3541 if (!sk->dead)
3542 sk->state_change(sk);
3543 if(sk->debug)
3544 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3545 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3546 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3547 {
3548 flag |= 1;
3549 tcp_set_state(sk,TCP_CLOSE);
3550 sk->shutdown = SHUTDOWN_MASK;
3551 }
3552 }
3553
3554 /*
3555 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3556 *
3557 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3558 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3559 */
3560
3561 if (sk->state == TCP_FIN_WAIT1)
3562 {
3563
3564 if (!sk->dead)
3565 sk->state_change(sk);
3566 if (sk->rcv_ack_seq == sk->write_seq)
3567 {
3568 flag |= 1;
3569 sk->shutdown |= SEND_SHUTDOWN;
3570 tcp_set_state(sk, TCP_FIN_WAIT2);
3571 }
3572 }
3573
3574 /*
3575 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3576 *
3577 * Move to TIME_WAIT
3578 */
3579
3580 if (sk->state == TCP_CLOSING)
3581 {
3582
3583 if (!sk->dead)
3584 sk->state_change(sk);
3585 if (sk->rcv_ack_seq == sk->write_seq)
3586 {
3587 flag |= 1;
3588 tcp_time_wait(sk);
3589 }
3590 }
3591
3592 /*
3593 * Final ack of a three way shake
3594 */
3595
3596 if(sk->state==TCP_SYN_RECV)
3597 {
3598 tcp_set_state(sk, TCP_ESTABLISHED);
3599 tcp_options(sk,th);
3600 sk->dummy_th.dest=th->source;
3601 sk->copied_seq = sk->acked_seq;
3602 if(!sk->dead)
3603 sk->state_change(sk);
3604 if(sk->max_window==0)
3605 {
3606 sk->max_window=32; /* Sanity check */
3607 sk->mss=min(sk->max_window,sk->mtu);
3608 }
3609 }
3610
3611 /*
3612 * I make no guarantees about the first clause in the following
3613 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3614 * what conditions "!flag" would be true. However I think the rest
3615 * of the conditions would prevent that from causing any
3616 * unnecessary retransmission.
3617 * Clearly if the first packet has expired it should be
3618 * retransmitted. The other alternative, "flag&2 && retransmits", is
3619 * harder to explain: You have to look carefully at how and when the
3620 * timer is set and with what timeout. The most recent transmission always
3621 * sets the timer. So in general if the most recent thing has timed
3622 * out, everything before it has as well. So we want to go ahead and
3623 * retransmit some more. If we didn't explicitly test for this
3624 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3625 * would not be true. If you look at the pattern of timing, you can
3626 * show that rto is increased fast enough that the next packet would
3627 * almost never be retransmitted immediately. Then you'd end up
3628 * waiting for a timeout to send each packet on the retransmission
3629 * queue. With my implementation of the Karn sampling algorithm,
3630 * the timeout would double each time. The net result is that it would
3631 * take a hideous amount of time to recover from a single dropped packet.
3632 * It's possible that there should also be a test for TIME_WRITE, but
3633 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3634 * got to be in real retransmission mode.
3635 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3636 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3637 * As long as no further losses occur, this seems reasonable.
3638 */
3639
3640 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3641 (((flag&2) && sk->retransmits) ||
3642 (sk->send_head->when + sk->rto < jiffies)))
3643 {
3644 if(sk->send_head->when + sk->rto < jiffies)
3645 tcp_retransmit(sk,0);
3646 else
3647 {
3648 tcp_do_retransmit(sk, 1);
3649 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3650 }
3651 }
3652
3653 return(1);
3654 }
3655
3656
3657 /*
3658 * Process the FIN bit. This now behaves as it is supposed to work
3659 * and the FIN takes effect when it is validly part of sequence
3660 * space. Not before when we get holes.
3661 *
3662 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3663 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3664 * TIME-WAIT)
3665 *
3666 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3667 * close and we go into CLOSING (and later onto TIME-WAIT)
3668 *
3669 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3670 *
3671 */
3672
3673 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
3674 {
3675 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3676
3677 if (!sk->dead)
3678 {
3679 sk->state_change(sk);
3680 sock_wake_async(sk->socket, 1);
3681 }
3682
3683 switch(sk->state)
3684 {
3685 case TCP_SYN_RECV:
3686 case TCP_SYN_SENT:
3687 case TCP_ESTABLISHED:
3688 /*
3689 * move to CLOSE_WAIT, tcp_data() already handled
3690 * sending the ack.
3691 */
3692 tcp_set_state(sk,TCP_CLOSE_WAIT);
3693 if (th->rst)
3694 sk->shutdown = SHUTDOWN_MASK;
3695 break;
3696
3697 case TCP_CLOSE_WAIT:
3698 case TCP_CLOSING:
3699 /*
3700 * received a retransmission of the FIN, do
3701 * nothing.
3702 */
3703 break;
3704 case TCP_TIME_WAIT:
3705 /*
3706 * received a retransmission of the FIN,
3707 * restart the TIME_WAIT timer.
3708 */
3709 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3710 return(0);
3711 case TCP_FIN_WAIT1:
3712 /*
3713 * This case occurs when a simultaneous close
3714 * happens, we must ack the received FIN and
3715 * enter the CLOSING state.
3716 *
3717 * This causes a WRITE timeout, which will either
3718 * move on to TIME_WAIT when we timeout, or resend
3719 * the FIN properly (maybe we get rid of that annoying
3720 * FIN lost hang). The TIME_WRITE code is already correct
3721 * for handling this timeout.
3722 */
3723
3724 if(sk->ip_xmit_timeout != TIME_WRITE)
3725 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3726 tcp_set_state(sk,TCP_CLOSING);
3727 break;
3728 case TCP_FIN_WAIT2:
3729 /*
3730 * received a FIN -- send ACK and enter TIME_WAIT
3731 */
3732 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3733 sk->shutdown|=SHUTDOWN_MASK;
3734 tcp_set_state(sk,TCP_TIME_WAIT);
3735 break;
3736 case TCP_CLOSE:
3737 /*
3738 * already in CLOSE
3739 */
3740 break;
3741 default:
3742 tcp_set_state(sk,TCP_LAST_ACK);
3743
3744 /* Start the timers. */
3745 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3746 return(0);
3747 }
3748
3749 return(0);
3750 }
3751
3752
3753
3754 /*
3755 * This routine handles the data. If there is room in the buffer,
3756 * it will be have already been moved into it. If there is no
3757 * room, then we will just have to discard the packet.
3758 */
3759
3760 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
3761 unsigned long saddr, unsigned short len)
3762 {
3763 struct sk_buff *skb1, *skb2;
3764 struct tcphdr *th;
3765 int dup_dumped=0;
3766 u32 new_seq, shut_seq;
3767
3768 th = skb->h.th;
3769 skb->len = len -(th->doff*4);
3770
3771 /*
3772 * The bytes in the receive read/assembly queue has increased. Needed for the
3773 * low memory discard algorithm
3774 */
3775
3776 sk->bytes_rcv += skb->len;
3777
3778 if (skb->len == 0 && !th->fin)
3779 {
3780 /*
3781 * Don't want to keep passing ack's back and forth.
3782 * (someone sent us dataless, boring frame)
3783 */
3784 if (!th->ack)
3785 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
3786 kfree_skb(skb, FREE_READ);
3787 return(0);
3788 }
3789
3790 /*
3791 * We no longer have anyone receiving data on this connection.
3792 */
3793
3794 #ifndef TCP_DONT_RST_SHUTDOWN
3795
3796 if(sk->shutdown & RCV_SHUTDOWN)
3797 {
3798 /*
3799 * FIXME: BSD has some magic to avoid sending resets to
3800 * broken 4.2 BSD keepalives. Much to my surprise a few non
3801 * BSD stacks still have broken keepalives so we want to
3802 * cope with it.
3803 */
3804
3805 if(skb->len) /* We don't care if it's just an ack or
3806 a keepalive/window probe */
3807 {
3808 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
3809
3810 /* Do this the way 4.4BSD treats it. Not what I'd
3811 regard as the meaning of the spec but it's what BSD
3812 does and clearly they know everything 8) */
3813
3814 /*
3815 * This is valid because of two things
3816 *
3817 * a) The way tcp_data behaves at the bottom.
3818 * b) A fin takes effect when read not when received.
3819 */
3820
3821 shut_seq=sk->acked_seq+1; /* Last byte */
3822
3823 if(after(new_seq,shut_seq))
3824 {
3825 if(sk->debug)
3826 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
3827 sk, new_seq, shut_seq, sk->blog);
3828 if(sk->dead)
3829 {
3830 sk->acked_seq = new_seq + th->fin;
3831 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
3832 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
3833 tcp_statistics.TcpEstabResets++;
3834 tcp_set_state(sk,TCP_CLOSE);
3835 sk->err = EPIPE;
3836 sk->shutdown = SHUTDOWN_MASK;
3837 kfree_skb(skb, FREE_READ);
3838 return 0;
3839 }
3840 }
3841 }
3842 }
3843
3844 #endif
3845
3846 /*
3847 * Now we have to walk the chain, and figure out where this one
3848 * goes into it. This is set up so that the last packet we received
3849 * will be the first one we look at, that way if everything comes
3850 * in order, there will be no performance loss, and if they come
3851 * out of order we will be able to fit things in nicely.
3852 *
3853 * [AC: This is wrong. We should assume in order first and then walk
3854 * forwards from the first hole based upon real traffic patterns.]
3855 *
3856 */
3857
3858 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
3859 {
3860 skb_queue_head(&sk->receive_queue,skb);
3861 skb1= NULL;
3862 }
3863 else
3864 {
3865 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
3866 {
3867 if(sk->debug)
3868 {
3869 printk("skb1=%p :", skb1);
3870 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
3871 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
3872 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
3873 sk->acked_seq);
3874 }
3875
3876 /*
3877 * Optimisation: Duplicate frame or extension of previous frame from
3878 * same sequence point (lost ack case).
3879 * The frame contains duplicate data or replaces a previous frame
3880 * discard the previous frame (safe as sk->inuse is set) and put
3881 * the new one in its place.
3882 */
3883
3884 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
3885 {
3886 skb_append(skb1,skb);
3887 skb_unlink(skb1);
3888 kfree_skb(skb1,FREE_READ);
3889 dup_dumped=1;
3890 skb1=NULL;
3891 break;
3892 }
3893
3894 /*
3895 * Found where it fits
3896 */
3897
3898 if (after(th->seq+1, skb1->h.th->seq))
3899 {
3900 skb_append(skb1,skb);
3901 break;
3902 }
3903
3904 /*
3905 * See if we've hit the start. If so insert.
3906 */
3907 if (skb1 == skb_peek(&sk->receive_queue))
3908 {
3909 skb_queue_head(&sk->receive_queue, skb);
3910 break;
3911 }
3912 }
3913 }
3914
3915 /*
3916 * Figure out what the ack value for this frame is
3917 */
3918
3919 th->ack_seq = th->seq + skb->len;
3920 if (th->syn)
3921 th->ack_seq++;
3922 if (th->fin)
3923 th->ack_seq++;
3924
3925 if (before(sk->acked_seq, sk->copied_seq))
3926 {
3927 printk("*** tcp.c:tcp_data bug acked < copied\n");
3928 sk->acked_seq = sk->copied_seq;
3929 }
3930
3931 /*
3932 * Now figure out if we can ack anything. This is very messy because we really want two
3933 * receive queues, a completed and an assembly queue. We also want only one transmit
3934 * queue.
3935 */
3936
3937 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
3938 {
3939 if (before(th->seq, sk->acked_seq+1))
3940 {
3941 int newwindow;
3942
3943 if (after(th->ack_seq, sk->acked_seq))
3944 {
3945 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
3946 if (newwindow < 0)
3947 newwindow = 0;
3948 sk->window = newwindow;
3949 sk->acked_seq = th->ack_seq;
3950 }
3951 skb->acked = 1;
3952
3953 /*
3954 * When we ack the fin, we do the FIN
3955 * processing.
3956 */
3957
3958 if (skb->h.th->fin)
3959 {
3960 tcp_fin(skb,sk,skb->h.th);
3961 }
3962
3963 for(skb2 = skb->next;
3964 skb2 != (struct sk_buff *)&sk->receive_queue;
3965 skb2 = skb2->next)
3966 {
3967 if (before(skb2->h.th->seq, sk->acked_seq+1))
3968 {
3969 if (after(skb2->h.th->ack_seq, sk->acked_seq))
3970 {
3971 newwindow = sk->window -
3972 (skb2->h.th->ack_seq - sk->acked_seq);
3973 if (newwindow < 0)
3974 newwindow = 0;
3975 sk->window = newwindow;
3976 sk->acked_seq = skb2->h.th->ack_seq;
3977 }
3978 skb2->acked = 1;
3979 /*
3980 * When we ack the fin, we do
3981 * the fin handling.
3982 */
3983 if (skb2->h.th->fin)
3984 {
3985 tcp_fin(skb,sk,skb->h.th);
3986 }
3987
3988 /*
3989 * Force an immediate ack.
3990 */
3991
3992 sk->ack_backlog = sk->max_ack_backlog;
3993 }
3994 else
3995 {
3996 break;
3997 }
3998 }
3999
4000 /*
4001 * This also takes care of updating the window.
4002 * This if statement needs to be simplified.
4003 */
4004 if (!sk->delay_acks ||
4005 sk->ack_backlog >= sk->max_ack_backlog ||
4006 sk->bytes_rcv > sk->max_unacked || th->fin) {
4007 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4008 }
4009 else
4010 {
4011 sk->ack_backlog++;
4012 if(sk->debug)
4013 printk("Ack queued.\n");
4014 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4015 }
4016 }
4017 }
4018
4019 /*
4020 * If we've missed a packet, send an ack.
4021 * Also start a timer to send another.
4022 */
4023
4024 if (!skb->acked)
4025 {
4026
4027 /*
4028 * This is important. If we don't have much room left,
4029 * we need to throw out a few packets so we have a good
4030 * window. Note that mtu is used, not mss, because mss is really
4031 * for the send side. He could be sending us stuff as large as mtu.
4032 */
4033
4034 while (sk->prot->rspace(sk) < sk->mtu)
4035 {
4036 skb1 = skb_peek(&sk->receive_queue);
4037 if (skb1 == NULL)
4038 {
4039 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4040 break;
4041 }
4042
4043 /*
4044 * Don't throw out something that has been acked.
4045 */
4046
4047 if (skb1->acked)
4048 {
4049 break;
4050 }
4051
4052 skb_unlink(skb1);
4053 kfree_skb(skb1, FREE_READ);
4054 }
4055 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4056 sk->ack_backlog++;
4057 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4058 }
4059 else
4060 {
4061 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4062 }
4063
4064 /*
4065 * Now tell the user we may have some data.
4066 */
4067
4068 if (!sk->dead)
4069 {
4070 if(sk->debug)
4071 printk("Data wakeup.\n");
4072 sk->data_ready(sk,0);
4073 }
4074 return(0);
4075 }
4076
4077
4078 /*
4079 * This routine is only called when we have urgent data
4080 * signalled. Its the 'slow' part of tcp_urg. It could be
4081 * moved inline now as tcp_urg is only called from one
4082 * place. We handle URGent data wrong. We have to - as
4083 * BSD still doesn't use the correction from RFC961.
4084 */
4085
4086 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4087 {
4088 unsigned long ptr = ntohs(th->urg_ptr);
4089
4090 if (ptr)
4091 ptr--;
4092 ptr += th->seq;
4093
4094 /* ignore urgent data that we've already seen and read */
4095 if (after(sk->copied_seq, ptr))
4096 return;
4097
4098 /* do we already have a newer (or duplicate) urgent pointer? */
4099 if (sk->urg_data && !after(ptr, sk->urg_seq))
4100 return;
4101
4102 /* tell the world about our new urgent pointer */
4103 if (sk->proc != 0) {
4104 if (sk->proc > 0) {
4105 kill_proc(sk->proc, SIGURG, 1);
4106 } else {
4107 kill_pg(-sk->proc, SIGURG, 1);
4108 }
4109 }
4110 sk->urg_data = URG_NOTYET;
4111 sk->urg_seq = ptr;
4112 }
4113
4114 /*
4115 * This is the 'fast' part of urgent handling.
4116 */
4117
4118 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4119 unsigned long saddr, unsigned long len)
4120 {
4121 unsigned long ptr;
4122
4123 /*
4124 * Check if we get a new urgent pointer - normally not
4125 */
4126
4127 if (th->urg)
4128 tcp_check_urg(sk,th);
4129
4130 /*
4131 * Do we wait for any urgent data? - normally not
4132 */
4133
4134 if (sk->urg_data != URG_NOTYET)
4135 return 0;
4136
4137 /*
4138 * Is the urgent pointer pointing into this packet?
4139 */
4140
4141 ptr = sk->urg_seq - th->seq + th->doff*4;
4142 if (ptr >= len)
4143 return 0;
4144
4145 /*
4146 * Ok, got the correct packet, update info
4147 */
4148
4149 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4150 if (!sk->dead)
4151 sk->data_ready(sk,0);
4152 return 0;
4153 }
4154
4155 /*
4156 * This will accept the next outstanding connection.
4157 */
4158
4159 static struct sock *tcp_accept(struct sock *sk, int flags)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4160 {
4161 struct sock *newsk;
4162 struct sk_buff *skb;
4163
4164 /*
4165 * We need to make sure that this socket is listening,
4166 * and that it has something pending.
4167 */
4168
4169 if (sk->state != TCP_LISTEN)
4170 {
4171 sk->err = EINVAL;
4172 return(NULL);
4173 }
4174
4175 /* Avoid the race. */
4176 cli();
4177 sk->inuse = 1;
4178
4179 while((skb = tcp_dequeue_established(sk)) == NULL)
4180 {
4181 if (flags & O_NONBLOCK)
4182 {
4183 sti();
4184 release_sock(sk);
4185 sk->err = EAGAIN;
4186 return(NULL);
4187 }
4188
4189 release_sock(sk);
4190 interruptible_sleep_on(sk->sleep);
4191 if (current->signal & ~current->blocked)
4192 {
4193 sti();
4194 sk->err = ERESTARTSYS;
4195 return(NULL);
4196 }
4197 sk->inuse = 1;
4198 }
4199 sti();
4200
4201 /*
4202 * Now all we need to do is return skb->sk.
4203 */
4204
4205 newsk = skb->sk;
4206
4207 kfree_skb(skb, FREE_READ);
4208 sk->ack_backlog--;
4209 release_sock(sk);
4210 return(newsk);
4211 }
4212
4213
4214 /*
4215 * This will initiate an outgoing connection.
4216 */
4217
4218 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4219 {
4220 struct sk_buff *buff;
4221 struct device *dev=NULL;
4222 unsigned char *ptr;
4223 int tmp;
4224 int atype;
4225 struct tcphdr *t1;
4226 struct rtable *rt;
4227
4228 if (sk->state != TCP_CLOSE)
4229 {
4230 return(-EISCONN);
4231 }
4232
4233 if (addr_len < 8)
4234 return(-EINVAL);
4235
4236 if (usin->sin_family && usin->sin_family != AF_INET)
4237 return(-EAFNOSUPPORT);
4238
4239 /*
4240 * connect() to INADDR_ANY means loopback (BSD'ism).
4241 */
4242
4243 if(usin->sin_addr.s_addr==INADDR_ANY)
4244 usin->sin_addr.s_addr=ip_my_addr();
4245
4246 /*
4247 * Don't want a TCP connection going to a broadcast address
4248 */
4249
4250 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4251 return -ENETUNREACH;
4252
4253 sk->inuse = 1;
4254 sk->daddr = usin->sin_addr.s_addr;
4255 sk->write_seq = tcp_init_seq();
4256 sk->window_seq = sk->write_seq;
4257 sk->rcv_ack_seq = sk->write_seq -1;
4258 sk->err = 0;
4259 sk->dummy_th.dest = usin->sin_port;
4260 release_sock(sk);
4261
4262 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4263 if (buff == NULL)
4264 {
4265 return(-ENOMEM);
4266 }
4267 sk->inuse = 1;
4268 buff->len = 24;
4269 buff->sk = sk;
4270 buff->free = 0;
4271 buff->localroute = sk->localroute;
4272
4273 t1 = (struct tcphdr *) buff->data;
4274
4275 /*
4276 * Put in the IP header and routing stuff.
4277 */
4278
4279 rt=ip_rt_route(sk->daddr, NULL, NULL);
4280
4281
4282 /*
4283 * We need to build the routing stuff from the things saved in skb.
4284 */
4285
4286 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4287 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4288 if (tmp < 0)
4289 {
4290 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4291 release_sock(sk);
4292 return(-ENETUNREACH);
4293 }
4294
4295 buff->len += tmp;
4296 t1 = (struct tcphdr *)((char *)t1 +tmp);
4297
4298 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4299 t1->seq = ntohl(sk->write_seq++);
4300 sk->sent_seq = sk->write_seq;
4301 buff->h.seq = sk->write_seq;
4302 t1->ack = 0;
4303 t1->window = 2;
4304 t1->res1=0;
4305 t1->res2=0;
4306 t1->rst = 0;
4307 t1->urg = 0;
4308 t1->psh = 0;
4309 t1->syn = 1;
4310 t1->urg_ptr = 0;
4311 t1->doff = 6;
4312 /* use 512 or whatever user asked for */
4313
4314 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4315 sk->window_clamp=rt->rt_window;
4316 else
4317 sk->window_clamp=0;
4318
4319 if (sk->user_mss)
4320 sk->mtu = sk->user_mss;
4321 else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
4322 sk->mtu = rt->rt_mss;
4323 else
4324 {
4325 #ifdef CONFIG_INET_SNARL
4326 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4327 #else
4328 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4329 #endif
4330 sk->mtu = 576 - HEADER_SIZE;
4331 else
4332 sk->mtu = MAX_WINDOW;
4333 }
4334 /*
4335 * but not bigger than device MTU
4336 */
4337
4338 if(sk->mtu <32)
4339 sk->mtu = 32; /* Sanity limit */
4340
4341 sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
4342
4343 /*
4344 * Put in the TCP options to say MTU.
4345 */
4346
4347 ptr = (unsigned char *)(t1+1);
4348 ptr[0] = 2;
4349 ptr[1] = 4;
4350 ptr[2] = (sk->mtu) >> 8;
4351 ptr[3] = (sk->mtu) & 0xff;
4352 tcp_send_check(t1, sk->saddr, sk->daddr,
4353 sizeof(struct tcphdr) + 4, sk);
4354
4355 /*
4356 * This must go first otherwise a really quick response will get reset.
4357 */
4358
4359 tcp_cache_zap();
4360 tcp_set_state(sk,TCP_SYN_SENT);
4361 if(rt&&rt->rt_flags&RTF_IRTT)
4362 sk->rto = rt->rt_irtt;
4363 else
4364 sk->rto = TCP_TIMEOUT_INIT;
4365 sk->retransmit_timer.function=&retransmit_timer;
4366 sk->retransmit_timer.data = (unsigned long)sk;
4367 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4368 sk->retransmits = TCP_SYN_RETRIES;
4369
4370 sk->prot->queue_xmit(sk, dev, buff, 0);
4371 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4372 tcp_statistics.TcpActiveOpens++;
4373 tcp_statistics.TcpOutSegs++;
4374
4375 release_sock(sk);
4376 return(0);
4377 }
4378
4379
4380 /* This functions checks to see if the tcp header is actually acceptable. */
4381 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4382 struct options *opt, unsigned long saddr, struct device *dev)
4383 {
4384 u32 next_seq;
4385
4386 next_seq = len - 4*th->doff;
4387 if (th->fin)
4388 next_seq++;
4389 /* if we have a zero window, we can't have any data in the packet.. */
4390 if (next_seq && !sk->window)
4391 goto ignore_it;
4392 next_seq += th->seq;
4393
4394 /*
4395 * This isn't quite right. sk->acked_seq could be more recent
4396 * than sk->window. This is however close enough. We will accept
4397 * slightly more packets than we should, but it should not cause
4398 * problems unless someone is trying to forge packets.
4399 */
4400
4401 /* have we already seen all of this packet? */
4402 if (!after(next_seq+1, sk->acked_seq))
4403 goto ignore_it;
4404 /* or does it start beyond the window? */
4405 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4406 goto ignore_it;
4407
4408 /* ok, at least part of this packet would seem interesting.. */
4409 return 1;
4410
4411 ignore_it:
4412 if (th->rst)
4413 return 0;
4414
4415 /*
4416 * Send a reset if we get something not ours and we are
4417 * unsynchronized. Note: We don't do anything to our end. We
4418 * are just killing the bogus remote connection then we will
4419 * connect again and it will work (with luck).
4420 */
4421
4422 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4423 {
4424 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4425 return 1;
4426 }
4427
4428 /* Try to resync things. */
4429 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4430 return 0;
4431 }
4432
4433 /*
4434 * When we get a reset we do this.
4435 */
4436
4437 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4438 {
4439 sk->zapped = 1;
4440 sk->err = ECONNRESET;
4441 if (sk->state == TCP_SYN_SENT)
4442 sk->err = ECONNREFUSED;
4443 if (sk->state == TCP_CLOSE_WAIT)
4444 sk->err = EPIPE;
4445 #ifdef TCP_DO_RFC1337
4446 /*
4447 * Time wait assassination protection [RFC1337]
4448 */
4449 if(sk->state!=TCP_TIME_WAIT)
4450 {
4451 tcp_set_state(sk,TCP_CLOSE);
4452 sk->shutdown = SHUTDOWN_MASK;
4453 }
4454 #else
4455 tcp_set_state(sk,TCP_CLOSE);
4456 sk->shutdown = SHUTDOWN_MASK;
4457 #endif
4458 if (!sk->dead)
4459 sk->state_change(sk);
4460 kfree_skb(skb, FREE_READ);
4461 release_sock(sk);
4462 return(0);
4463 }
4464
4465 /*
4466 * A TCP packet has arrived.
4467 */
4468
4469 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4470 unsigned long daddr, unsigned short len,
4471 unsigned long saddr, int redo, struct inet_protocol * protocol)
4472 {
4473 struct tcphdr *th;
4474 struct sock *sk;
4475 int syn_ok=0;
4476
4477 tcp_statistics.TcpInSegs++;
4478
4479 if(skb->pkt_type!=PACKET_HOST)
4480 {
4481 kfree_skb(skb,FREE_READ);
4482 return(0);
4483 }
4484
4485 th = skb->h.th;
4486
4487 /*
4488 * Find the socket, using the last hit cache if applicable.
4489 */
4490
4491 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4492 sk=(struct sock *)th_cache_sk;
4493 else
4494 {
4495 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4496 th_cache_saddr=saddr;
4497 th_cache_daddr=daddr;
4498 th_cache_dport=th->dest;
4499 th_cache_sport=th->source;
4500 th_cache_sk=sk;
4501 }
4502
4503 /*
4504 * If this socket has got a reset it's to all intents and purposes
4505 * really dead. Count closed sockets as dead.
4506 *
4507 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4508 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4509 * exist so should cause resets as if the port was unreachable.
4510 */
4511
4512 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4513 sk=NULL;
4514
4515 if (!redo)
4516 {
4517 if (tcp_check(th, len, saddr, daddr ))
4518 {
4519 skb->sk = NULL;
4520 kfree_skb(skb,FREE_READ);
4521 /*
4522 * We don't release the socket because it was
4523 * never marked in use.
4524 */
4525 return(0);
4526 }
4527 th->seq = ntohl(th->seq);
4528
4529 /* See if we know about the socket. */
4530 if (sk == NULL)
4531 {
4532 /*
4533 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4534 */
4535 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4536 skb->sk = NULL;
4537 /*
4538 * Discard frame
4539 */
4540 kfree_skb(skb, FREE_READ);
4541 return(0);
4542 }
4543
4544 skb->len = len;
4545 skb->acked = 0;
4546 skb->used = 0;
4547 skb->free = 0;
4548 skb->saddr = daddr;
4549 skb->daddr = saddr;
4550
4551 /* We may need to add it to the backlog here. */
4552 cli();
4553 if (sk->inuse)
4554 {
4555 skb_queue_tail(&sk->back_log, skb);
4556 sti();
4557 return(0);
4558 }
4559 sk->inuse = 1;
4560 sti();
4561 }
4562 else
4563 {
4564 if (sk==NULL)
4565 {
4566 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4567 skb->sk = NULL;
4568 kfree_skb(skb, FREE_READ);
4569 return(0);
4570 }
4571 }
4572
4573
4574 if (!sk->prot)
4575 {
4576 printk("IMPOSSIBLE 3\n");
4577 return(0);
4578 }
4579
4580
4581 /*
4582 * Charge the memory to the socket.
4583 */
4584
4585 if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
4586 {
4587 kfree_skb(skb, FREE_READ);
4588 release_sock(sk);
4589 return(0);
4590 }
4591
4592 skb->sk=sk;
4593 sk->rmem_alloc += skb->mem_len;
4594
4595 /*
4596 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4597 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4598 * compatibility. We also set up variables more thoroughly [Karn notes in the
4599 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4600 */
4601
4602 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4603 {
4604
4605 /*
4606 * Now deal with unusual cases.
4607 */
4608
4609 if(sk->state==TCP_LISTEN)
4610 {
4611 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4612 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4613
4614 /*
4615 * We don't care for RST, and non SYN are absorbed (old segments)
4616 * Broadcast/multicast SYN isn't allowed. Note - bug if you change the
4617 * netmask on a running connection it can go broadcast. Even Sun's have
4618 * this problem so I'm ignoring it
4619 */
4620
4621 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4622 {
4623 kfree_skb(skb, FREE_READ);
4624 release_sock(sk);
4625 return 0;
4626 }
4627
4628 /*
4629 * Guess we need to make a new socket up
4630 */
4631
4632 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4633
4634 /*
4635 * Now we have several options: In theory there is nothing else
4636 * in the frame. KA9Q has an option to send data with the syn,
4637 * BSD accepts data with the syn up to the [to be] advertised window
4638 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4639 * it, that fits the spec precisely and avoids incompatibilities. It
4640 * would be nice in future to drop through and process the data.
4641 */
4642
4643 release_sock(sk);
4644 return 0;
4645 }
4646
4647 /* retransmitted SYN? */
4648 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4649 {
4650 kfree_skb(skb, FREE_READ);
4651 release_sock(sk);
4652 return 0;
4653 }
4654
4655 /*
4656 * SYN sent means we have to look for a suitable ack and either reset
4657 * for bad matches or go to connected
4658 */
4659
4660 if(sk->state==TCP_SYN_SENT)
4661 {
4662 /* Crossed SYN or previous junk segment */
4663 if(th->ack)
4664 {
4665 /* We got an ack, but it's not a good ack */
4666 if(!tcp_ack(sk,th,saddr,len))
4667 {
4668 /* Reset the ack - its an ack from a
4669 different connection [ th->rst is checked in tcp_reset()] */
4670 tcp_statistics.TcpAttemptFails++;
4671 tcp_reset(daddr, saddr, th,
4672 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4673 kfree_skb(skb, FREE_READ);
4674 release_sock(sk);
4675 return(0);
4676 }
4677 if(th->rst)
4678 return tcp_std_reset(sk,skb);
4679 if(!th->syn)
4680 {
4681 /* A valid ack from a different connection
4682 start. Shouldn't happen but cover it */
4683 kfree_skb(skb, FREE_READ);
4684 release_sock(sk);
4685 return 0;
4686 }
4687 /*
4688 * Ok.. it's good. Set up sequence numbers and
4689 * move to established.
4690 */
4691 syn_ok=1; /* Don't reset this connection for the syn */
4692 sk->acked_seq=th->seq+1;
4693 sk->fin_seq=th->seq;
4694 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4695 tcp_set_state(sk, TCP_ESTABLISHED);
4696 tcp_options(sk,th);
4697 sk->dummy_th.dest=th->source;
4698 sk->copied_seq = sk->acked_seq;
4699 if(!sk->dead)
4700 {
4701 sk->state_change(sk);
4702 sock_wake_async(sk->socket, 0);
4703 }
4704 if(sk->max_window==0)
4705 {
4706 sk->max_window = 32;
4707 sk->mss = min(sk->max_window, sk->mtu);
4708 }
4709 }
4710 else
4711 {
4712 /* See if SYN's cross. Drop if boring */
4713 if(th->syn && !th->rst)
4714 {
4715 /* Crossed SYN's are fine - but talking to
4716 yourself is right out... */
4717 if(sk->saddr==saddr && sk->daddr==daddr &&
4718 sk->dummy_th.source==th->source &&
4719 sk->dummy_th.dest==th->dest)
4720 {
4721 tcp_statistics.TcpAttemptFails++;
4722 return tcp_std_reset(sk,skb);
4723 }
4724 tcp_set_state(sk,TCP_SYN_RECV);
4725
4726 /*
4727 * FIXME:
4728 * Must send SYN|ACK here
4729 */
4730 }
4731 /* Discard junk segment */
4732 kfree_skb(skb, FREE_READ);
4733 release_sock(sk);
4734 return 0;
4735 }
4736 /*
4737 * SYN_RECV with data maybe.. drop through
4738 */
4739 goto rfc_step6;
4740 }
4741
4742 /*
4743 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4744 * a more complex suggestion for fixing these reuse issues in RFC1644
4745 * but not yet ready for general use. Also see RFC1379.
4746 */
4747
4748 #define BSD_TIME_WAIT
4749 #ifdef BSD_TIME_WAIT
4750 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4751 after(th->seq, sk->acked_seq) && !th->rst)
4752 {
4753 u32 seq = sk->write_seq;
4754 if(sk->debug)
4755 printk("Doing a BSD time wait\n");
4756 tcp_statistics.TcpEstabResets++;
4757 sk->rmem_alloc -= skb->mem_len;
4758 skb->sk = NULL;
4759 sk->err=ECONNRESET;
4760 tcp_set_state(sk, TCP_CLOSE);
4761 sk->shutdown = SHUTDOWN_MASK;
4762 release_sock(sk);
4763 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4764 if (sk && sk->state==TCP_LISTEN)
4765 {
4766 sk->inuse=1;
4767 skb->sk = sk;
4768 sk->rmem_alloc += skb->mem_len;
4769 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
4770 release_sock(sk);
4771 return 0;
4772 }
4773 kfree_skb(skb, FREE_READ);
4774 return 0;
4775 }
4776 #endif
4777 }
4778
4779 /*
4780 * We are now in normal data flow (see the step list in the RFC)
4781 * Note most of these are inline now. I'll inline the lot when
4782 * I have time to test it hard and look at what gcc outputs
4783 */
4784
4785 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
4786 {
4787 kfree_skb(skb, FREE_READ);
4788 release_sock(sk);
4789 return 0;
4790 }
4791
4792 if(th->rst)
4793 return tcp_std_reset(sk,skb);
4794
4795 /*
4796 * !syn_ok is effectively the state test in RFC793.
4797 */
4798
4799 if(th->syn && !syn_ok)
4800 {
4801 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
4802 return tcp_std_reset(sk,skb);
4803 }
4804
4805 /*
4806 * Process the ACK
4807 */
4808
4809
4810 if(th->ack && !tcp_ack(sk,th,saddr,len))
4811 {
4812 /*
4813 * Our three way handshake failed.
4814 */
4815
4816 if(sk->state==TCP_SYN_RECV)
4817 {
4818 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
4819 }
4820 kfree_skb(skb, FREE_READ);
4821 release_sock(sk);
4822 return 0;
4823 }
4824
4825 rfc_step6: /* I'll clean this up later */
4826
4827 /*
4828 * Process urgent data
4829 */
4830
4831 if(tcp_urg(sk, th, saddr, len))
4832 {
4833 kfree_skb(skb, FREE_READ);
4834 release_sock(sk);
4835 return 0;
4836 }
4837
4838
4839 /*
4840 * Process the encapsulated data
4841 */
4842
4843 if(tcp_data(skb,sk, saddr, len))
4844 {
4845 kfree_skb(skb, FREE_READ);
4846 release_sock(sk);
4847 return 0;
4848 }
4849
4850 /*
4851 * And done
4852 */
4853
4854 release_sock(sk);
4855 return 0;
4856 }
4857
4858 /*
4859 * This routine sends a packet with an out of date sequence
4860 * number. It assumes the other end will try to ack it.
4861 */
4862
4863 static void tcp_write_wakeup(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
4864 {
4865 struct sk_buff *buff,*skb;
4866 struct tcphdr *t1;
4867 struct device *dev=NULL;
4868 int tmp;
4869
4870 if (sk->zapped)
4871 return; /* After a valid reset we can send no more */
4872
4873 /*
4874 * Write data can still be transmitted/retransmitted in the
4875 * following states. If any other state is encountered, return.
4876 * [listen/close will never occur here anyway]
4877 */
4878
4879 if (sk->state != TCP_ESTABLISHED &&
4880 sk->state != TCP_CLOSE_WAIT &&
4881 sk->state != TCP_FIN_WAIT1 &&
4882 sk->state != TCP_LAST_ACK &&
4883 sk->state != TCP_CLOSING
4884 )
4885 {
4886 return;
4887 }
4888
4889 if (before(sk->sent_seq, sk->window_seq) &&
4890 (skb=skb_peek(&sk->write_queue)))
4891 {
4892 /*
4893 * We are probing the opening of a window
4894 * but the window size is != 0
4895 * must have been a result SWS advoidance ( sender )
4896 */
4897
4898 struct iphdr *iph;
4899 struct tcphdr *th;
4900 struct tcphdr *nth;
4901 unsigned long win_size, ow_size;
4902 void * tcp_data_start;
4903
4904 win_size = sk->window_seq - sk->sent_seq;
4905
4906 iph = (struct iphdr *)(skb->data + skb->dev->hard_header_len);
4907 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
4908
4909 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
4910 (iph->ihl << 2) +
4911 skb->dev->hard_header_len,
4912 1, GFP_ATOMIC);
4913 if ( buff == NULL )
4914 return;
4915
4916 buff->len = 0;
4917
4918 /*
4919 * If we strip the packet on the write queue we must
4920 * be ready to retransmit this one
4921 */
4922
4923 buff->free = 0;
4924
4925 buff->sk = sk;
4926 buff->localroute = sk->localroute;
4927
4928 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4929 IPPROTO_TCP, sk->opt, buff->mem_len,
4930 sk->ip_tos,sk->ip_ttl);
4931 if (tmp < 0)
4932 {
4933 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
4934 return;
4935 }
4936
4937 buff->len += tmp;
4938 buff->dev = dev;
4939
4940 nth = (struct tcphdr *) (buff->data + buff->len);
4941 buff->len += th->doff * 4;
4942
4943 memcpy(nth, th, th->doff * 4);
4944
4945 nth->ack = 1;
4946 nth->ack_seq = ntohl(sk->acked_seq);
4947 nth->window = ntohs(tcp_select_window(sk));
4948 nth->check = 0;
4949
4950 tcp_data_start = skb->data + skb->dev->hard_header_len +
4951 (iph->ihl << 2) + th->doff * 4;
4952
4953 memcpy(buff->data + buff->len, tcp_data_start, win_size);
4954 buff->len += win_size;
4955 buff->h.seq = sk->sent_seq + win_size;
4956
4957 /*
4958 * now: shrink the queue head segment
4959 */
4960
4961 th->check = 0;
4962 ow_size = skb->len - win_size -
4963 ((unsigned long) (tcp_data_start - (void *) skb->data));
4964
4965 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
4966 skb->len -= win_size;
4967 sk->sent_seq += win_size;
4968 th->seq = htonl(sk->sent_seq);
4969
4970 if (th->urg)
4971 {
4972 unsigned short urg_ptr;
4973
4974 urg_ptr = ntohs(th->urg_ptr);
4975 if (urg_ptr <= win_size)
4976 th->urg = 0;
4977 else
4978 {
4979 urg_ptr -= win_size;
4980 th->urg_ptr = htons(urg_ptr);
4981 nth->urg_ptr = htons(win_size);
4982 }
4983 }
4984
4985 tcp_send_check(nth, sk->saddr, sk->daddr,
4986 nth->doff * 4 + win_size , sk);
4987 }
4988 else
4989 {
4990 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
4991 if (buff == NULL)
4992 return;
4993
4994 buff->len = sizeof(struct tcphdr);
4995 buff->free = 1;
4996 buff->sk = sk;
4997 buff->localroute = sk->localroute;
4998
4999 t1 = (struct tcphdr *) buff->data;
5000
5001 /*
5002 * Put in the IP header and routing stuff.
5003 */
5004
5005 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5006 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5007 if (tmp < 0)
5008 {
5009 sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
5010 return;
5011 }
5012
5013 buff->len += tmp;
5014 t1 = (struct tcphdr *)((char *)t1 +tmp);
5015
5016 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5017
5018 /*
5019 * Use a previous sequence.
5020 * This should cause the other end to send an ack.
5021 */
5022
5023 t1->seq = htonl(sk->sent_seq-1);
5024 t1->ack = 1;
5025 t1->res1= 0;
5026 t1->res2= 0;
5027 t1->rst = 0;
5028 t1->urg = 0;
5029 t1->psh = 0;
5030 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5031 t1->syn = 0;
5032 t1->ack_seq = ntohl(sk->acked_seq);
5033 t1->window = ntohs(tcp_select_window(sk));
5034 t1->doff = sizeof(*t1)/4;
5035 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5036
5037 }
5038
5039 /*
5040 * Send it.
5041 */
5042
5043 sk->prot->queue_xmit(sk, dev, buff, 1);
5044 tcp_statistics.TcpOutSegs++;
5045 }
5046
5047 /*
5048 * A window probe timeout has occurred.
5049 */
5050
5051 void tcp_send_probe0(struct sock *sk)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
5052 {
5053 if (sk->zapped)
5054 return; /* After a valid reset we can send no more */
5055
5056 tcp_write_wakeup(sk);
5057
5058 sk->backoff++;
5059 sk->rto = min(sk->rto << 1, 120*HZ);
5060 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5061 sk->retransmits++;
5062 sk->prot->retransmits ++;
5063 }
5064
5065 /*
5066 * Socket option code for TCP.
5067 */
5068
5069 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
5070 {
5071 int val,err;
5072
5073 if(level!=SOL_TCP)
5074 return ip_setsockopt(sk,level,optname,optval,optlen);
5075
5076 if (optval == NULL)
5077 return(-EINVAL);
5078
5079 err=verify_area(VERIFY_READ, optval, sizeof(int));
5080 if(err)
5081 return err;
5082
5083 val = get_fs_long((unsigned long *)optval);
5084
5085 switch(optname)
5086 {
5087 case TCP_MAXSEG:
5088 /*
5089 * values greater than interface MTU won't take effect. however at
5090 * the point when this call is done we typically don't yet know
5091 * which interface is going to be used
5092 */
5093 if(val<1||val>MAX_WINDOW)
5094 return -EINVAL;
5095 sk->user_mss=val;
5096 return 0;
5097 case TCP_NODELAY:
5098 sk->nonagle=(val==0)?0:1;
5099 return 0;
5100 default:
5101 return(-ENOPROTOOPT);
5102 }
5103 }
5104
5105 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
5106 {
5107 int val,err;
5108
5109 if(level!=SOL_TCP)
5110 return ip_getsockopt(sk,level,optname,optval,optlen);
5111
5112 switch(optname)
5113 {
5114 case TCP_MAXSEG:
5115 val=sk->user_mss;
5116 break;
5117 case TCP_NODELAY:
5118 val=sk->nonagle;
5119 break;
5120 default:
5121 return(-ENOPROTOOPT);
5122 }
5123 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5124 if(err)
5125 return err;
5126 put_fs_long(sizeof(int),(unsigned long *) optlen);
5127
5128 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5129 if(err)
5130 return err;
5131 put_fs_long(val,(unsigned long *)optval);
5132
5133 return(0);
5134 }
5135
5136
5137 struct proto tcp_prot = {
5138 sock_wmalloc,
5139 sock_rmalloc,
5140 sock_wfree,
5141 sock_rfree,
5142 sock_rspace,
5143 sock_wspace,
5144 tcp_close,
5145 tcp_read,
5146 tcp_write,
5147 tcp_sendto,
5148 tcp_recvfrom,
5149 ip_build_header,
5150 tcp_connect,
5151 tcp_accept,
5152 ip_queue_xmit,
5153 tcp_retransmit,
5154 tcp_write_wakeup,
5155 tcp_read_wakeup,
5156 tcp_rcv,
5157 tcp_select,
5158 tcp_ioctl,
5159 NULL,
5160 tcp_shutdown,
5161 tcp_setsockopt,
5162 tcp_getsockopt,
5163 128,
5164 0,
5165 "TCP",
5166 0, 0,
5167 {NULL,}
5168 };