1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: @(#)tcp.c 1.0.16 05/25/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken;
29 * pointers passed were wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. select
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle select() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), select() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in selecting before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFCs. For other useful protocol
138 * references see: Comer, KA9Q NOS; and
139 * for a reference on the difference
140 * between the specifications and how BSD
141 * actually works, see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work on first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up with retrying when it
154 * gets no answer (not even a 'no space' one).
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : Select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries get used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications
178 *
179 *
180 * To Fix:
181 * Fast path the code. Two things here - fix the window calculation
182 * so it doesn't iterate over the queue, also spot packets with no funny
183 * options arriving in order and process directly.
184 *
185 * Implement RFC 1191 [Path MTU discovery]
186 * Look at the effect of implementing RFC 1337 suggestions and their impact.
187 * Rewrite output state machine to use a single queue and do low window
188 * situations as per the spec (RFC 1122)
189 * Speed up input assembly algorithm.
190 * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
191 * could do with it working on IPv4
192 * User settable/learned rtt/max window/mtu
193 * Cope with MTU/device switches when retransmitting in tcp.
194 * Fix the window handling to use PR's new code.
195 *
196 * Change the fundamental structure to a single send queue maintained
197 * by TCP (removing the bogus ip stuff [thus fixing mtu drops on
198 * active routes too]). Cut the queue off in tcp_retransmit/
199 * tcp_transmit.
200 * Change the receive queue to assemble as it goes. This lets us
201 * dispose of most of tcp_sequence, half of tcp_ack and chunks of
202 * tcp_data/tcp_read as well as the window shrink crud.
203 * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
204 * tcp_queue_skb seem obvious routines to extract.
205 *
206 * This program is free software; you can redistribute it and/or
207 * modify it under the terms of the GNU General Public License
208 * as published by the Free Software Foundation; either version
209 * 2 of the License, or(at your option) any later version.
210 *
211 * Description of States:
212 *
213 * TCP_SYN_SENT sent a connection request, waiting for ack
214 *
215 * TCP_SYN_RECV received a connection request, sent ack,
216 * waiting for final ack in three-way handshake.
217 *
218 * TCP_ESTABLISHED connection established
219 *
220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
221 * transmission of remaining buffered data
222 *
223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
224 * to shutdown
225 *
226 * TCP_CLOSING both sides have shutdown but we still have
227 * data we have to finish sending
228 *
229 * TCP_TIME_WAIT timeout to catch resent junk before entering
230 * closed, can only be entered from FIN_WAIT2
231 * or CLOSING. Required because the other end
232 * may not have gotten our last ACK causing it
233 * to retransmit the data packet (which we ignore)
234 *
235 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
236 * us to finish writing our data and to shutdown
237 * (we have to close() to move on to LAST_ACK)
238 *
239 * TCP_LAST_ACK our side has shutdown after remote has
240 * shutdown. There may still be data in our
241 * buffer that we have to finish sending
242 *
243 * TCP_CLOSE socket is finished
244 */
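/*
 * An illustrative sketch (assumed names, not kernel API) of the
 * closing-side transitions described above, restated as code. It
 * follows the RFC 793 state machine the comment summarises: TIME_WAIT
 * is reachable only from FIN_WAIT2 or CLOSING, and CLOSE_WAIT needs a
 * local close() to move on to LAST_ACK.
 */

enum tcp_sketch_state {
	SK_ESTABLISHED, SK_FIN_WAIT1, SK_FIN_WAIT2, SK_CLOSING,
	SK_TIME_WAIT, SK_CLOSE_WAIT, SK_LAST_ACK, SK_CLOSE
};

enum tcp_sketch_event { EV_LOCAL_CLOSE, EV_FIN_RCVD, EV_FIN_ACKED };

static enum tcp_sketch_state
tcp_close_transition(enum tcp_sketch_state s, enum tcp_sketch_event e)
{
	switch (s) {
	case SK_ESTABLISHED:
		if (e == EV_LOCAL_CLOSE) return SK_FIN_WAIT1;	/* we shut down first */
		if (e == EV_FIN_RCVD)	 return SK_CLOSE_WAIT;	/* remote shut down first */
		break;
	case SK_FIN_WAIT1:
		if (e == EV_FIN_ACKED)	 return SK_FIN_WAIT2;
		if (e == EV_FIN_RCVD)	 return SK_CLOSING;	/* simultaneous close */
		break;
	case SK_FIN_WAIT2:
		if (e == EV_FIN_RCVD)	 return SK_TIME_WAIT;	/* 2*MSL wait follows */
		break;
	case SK_CLOSING:
		if (e == EV_FIN_ACKED)	 return SK_TIME_WAIT;
		break;
	case SK_CLOSE_WAIT:
		if (e == EV_LOCAL_CLOSE) return SK_LAST_ACK;	/* app finally calls close() */
		break;
	case SK_LAST_ACK:
		if (e == EV_FIN_ACKED)	 return SK_CLOSE;
		break;
	default:
		break;
	}
	return s;	/* no transition for this event */
}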
245
246 /*
247 * RFC1122 status:
248 * NOTE: I'm not going to be doing comments in the code for this one except
249 * for violations and the like. tcp.c is just too big... If I say something
250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
251 * with Alan. -- MS 950903
252 *
253 * Use of PSH (4.2.2.2)
254 * MAY aggregate data sent without the PSH flag. (does)
255 * MAY queue data received without the PSH flag. (does)
256 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
257 * MAY implement PSH on send calls. (doesn't, thus:)
258 * MUST NOT buffer data indefinitely (doesn't [1 second])
259 * MUST set PSH on last segment (does)
260 * MAY pass received PSH to application layer (doesn't)
261 * SHOULD send maximum-sized segment whenever possible. (almost always does)
262 *
263 * Window Size (4.2.2.3, 4.2.2.16)
264 * MUST treat window size as an unsigned number (does)
265 * SHOULD treat window size as a 32-bit number (does not)
266 * MUST NOT shrink window once it is offered (does not normally)
267 *
268 * Urgent Pointer (4.2.2.4)
269 * **MUST point urgent pointer to last byte of urgent data (not right
270 * after). (doesn't, to be like BSD)
271 * MUST inform application layer asynchronously of incoming urgent
272 * data. (does)
273 * MUST provide application with means of determining the amount of
274 * urgent data pending. (does)
275 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
276 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
277 * [Follows BSD 1 byte of urgent data]
278 *
279 * TCP Options (4.2.2.5)
280 * MUST be able to receive TCP options in any segment. (does)
281 * MUST ignore unsupported options (does)
282 *
283 * Maximum Segment Size Option (4.2.2.6)
284 * MUST implement both sending and receiving MSS. (does)
285 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
286 * it always). (does, even when MSS == 536, which is legal)
287 * MUST assume MSS == 536 if no MSS received at connection setup (does)
288 * MUST calculate "effective send MSS" correctly:
289 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
290 * (does - but allows operator override)
291 *
292 * TCP Checksum (4.2.2.7)
293 * MUST generate and check TCP checksum. (does)
294 *
295 * Initial Sequence Number Selection (4.2.2.8)
296 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
297 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
298 * necessary for 10Mbps networks - and harder than BSD to spoof!)
299 *
300 * Simultaneous Open Attempts (4.2.2.10)
301 * MUST support simultaneous open attempts (does)
302 *
303 * Recovery from Old Duplicate SYN (4.2.2.11)
304 * MUST keep track of active vs. passive open (does)
305 *
306 * RST segment (4.2.2.12)
307 * SHOULD allow an RST segment to contain data (does, but doesn't do
308 * anything with it, which is standard)
309 *
310 * Closing a Connection (4.2.2.13)
311 * MUST inform application of whether connection was closed by RST or
312 * normal close. (does)
313 * MAY allow "half-duplex" close (treat connection as closed for the
314 * local app, even before handshake is done). (does)
315 * MUST linger in TIME_WAIT for 2 * MSL (does)
316 *
317 * Retransmission Timeout (4.2.2.15)
318 * MUST implement Jacobson's slow start and congestion avoidance
319 * stuff. (does)
320 *
321 * Probing Zero Windows (4.2.2.17)
322 * MUST support probing of zero windows. (does)
323 * MAY keep offered window closed indefinitely. (does)
324 * MUST allow remote window to stay closed indefinitely. (does)
325 *
326 * Passive Open Calls (4.2.2.18)
327 * MUST NOT let new passive open affect other connections. (doesn't)
328 * MUST support passive opens (LISTENs) concurrently. (does)
329 *
330 * Time to Live (4.2.2.19)
331 * MUST make TCP TTL configurable. (does - IP_TTL option)
332 *
333 * Event Processing (4.2.2.20)
334 * SHOULD queue out-of-order segments. (does)
335 * MUST aggregate ACK segments whenever possible. (does but badly)
336 *
337 * Retransmission Timeout Calculation (4.2.3.1)
338 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
339 * calculation. (does, or at least explains them in the comments 8*b)
340 * SHOULD initialize RTT to 0 and RTO to 3. (does)
341 *
342 * When to Send an ACK Segment (4.2.3.2)
343 * SHOULD implement delayed ACK. (does not)
344 * MUST keep ACK delay < 0.5 sec. (N/A)
345 *
346 * When to Send a Window Update (4.2.3.3)
347 * MUST implement receiver-side SWS. (does)
348 *
349 * When to Send Data (4.2.3.4)
350 * MUST implement sender-side SWS. (does - imperfectly)
351 * SHOULD implement Nagle algorithm. (does)
352 *
353 * TCP Connection Failures (4.2.3.5)
354 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
355 * SHOULD inform application layer of soft errors. (doesn't)
356 *
357 * TCP Keep-Alives (4.2.3.6)
358 * MAY provide keep-alives. (does)
359 * MUST make keep-alives configurable on a per-connection basis. (does)
360 * MUST default to no keep-alives. (does)
361 * **MUST make keep-alive interval configurable. (doesn't)
362 * **MUST make default keep-alive interval > 2 hours. (doesn't)
363 * MUST NOT interpret failure to ACK keep-alive packet as dead
364 * connection. (doesn't)
365 * SHOULD send keep-alive with no data. (does)
366 *
367 * TCP Multihoming (4.2.3.7)
368 * MUST get source address from IP layer before sending first
369 * SYN. (does)
370 * MUST use same local address for all segments of a connection. (does)
371 *
372 * IP Options (4.2.3.8)
373 * (I don't think the IP layer sees the IP options, yet.)
374 * MUST ignore unsupported IP options. (does, I guess 8*b)
375 * MAY support Time Stamp and Record Route. (doesn't)
376 * **MUST allow application to specify a source route. (doesn't?)
377 * **MUST allow received Source Route option to set route for all future
378 * segments on this connection. (doesn't, not that I think it's a
379 * huge problem)
380 *
381 * ICMP messages (4.2.3.9)
382 * MUST act on ICMP errors. (does)
383 * MUST slow transmission upon receipt of a Source Quench. (does)
384 * MUST NOT abort connection upon receipt of soft Destination
385 * Unreachables (0, 1, 5), Time Exceededs and Parameter
386 * Problems. (doesn't)
387 * SHOULD report soft Destination Unreachables etc. to the
388 * application. (doesn't)
389 * SHOULD abort connection upon receipt of hard Destination Unreachable
390 * messages (2, 3, 4). (does)
391 *
392 * Remote Address Validation (4.2.3.10)
393 * MUST reject as an error OPEN for invalid remote IP address. (does)
394 * MUST ignore SYN with invalid source address. (does)
395 * MUST silently discard incoming SYN for broadcast/multicast
396 * address. (does)
397 *
398 * Asynchronous Reports (4.2.4.1)
399 * **MUST provide mechanism for reporting soft errors to application
400 * layer. (doesn't)
401 *
402 * Type of Service (4.2.4.2)
403 * MUST allow application layer to set Type of Service. (does IP_TOS)
404 *
405 * (Whew. -- MS 950903)
406 **/
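/*
 * A worked sketch of the "effective send MSS" rule from RFC 1122
 * 4.2.2.6 quoted above. Function and parameter names are assumptions
 * for illustration only.
 */

static int effective_send_mss(int physical_mtu, int remote_mss,
			      int tcp_hdr_len, int ip_opts_len)
{
	/* min(physical_MTU, remote_MSS + 20) - sizeof(tcphdr) - sizeof(ipopts) */
	int limit = remote_mss + 20;	/* remote MSS plus a standard IP header */
	if (physical_mtu < limit)
		limit = physical_mtu;
	return limit - tcp_hdr_len - ip_opts_len;
}

/*
 * e.g. Ethernet MTU 1500, remote MSS 1460, 20 byte TCP header, no IP
 * options: min(1500, 1480) - 20 - 0 = 1460 bytes of data per segment.
 */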
407
408 #include <linux/types.h>
409 #include <linux/sched.h>
410 #include <linux/mm.h>
411 #include <linux/time.h>
412 #include <linux/string.h>
413 #include <linux/config.h>
414 #include <linux/socket.h>
415 #include <linux/sockios.h>
416 #include <linux/termios.h>
417 #include <linux/in.h>
418 #include <linux/fcntl.h>
419 #include <linux/inet.h>
420 #include <linux/netdevice.h>
421 #include <net/snmp.h>
422 #include <net/ip.h>
423 #include <net/protocol.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
426 #include <net/arp.h>
427 #include <linux/skbuff.h>
428 #include <net/sock.h>
429 #include <net/route.h>
430 #include <linux/errno.h>
431 #include <linux/timer.h>
432 #include <asm/system.h>
433 #include <asm/segment.h>
434 #include <linux/mm.h>
435 #include <net/checksum.h>
436
437 /*
438 * The MSL timer is the 'normal' timer.
439 */
440
441 #define reset_msl_timer(x,y,z) reset_timer(x,y,z)
442
443 #define SEQ_TICK 3
444 unsigned long seq_offset;
445 struct tcp_mib tcp_statistics;
446
447 /*
448 * Cached last hit socket
449 */
450
451 volatile unsigned long th_cache_saddr,th_cache_daddr;
452 volatile unsigned short th_cache_dport, th_cache_sport;
453 volatile struct sock *th_cache_sk;
454
455 void tcp_cache_zap(void)
456 {
457 unsigned long flags;
458 save_flags(flags);
459 cli();
460 th_cache_saddr=0;
461 th_cache_daddr=0;
462 th_cache_dport=0;
463 th_cache_sport=0;
464 th_cache_sk=NULL;
465 restore_flags(flags);
466 }
467
468 static void tcp_close(struct sock *sk, int timeout);
469
470
471 /*
472 * The less said about this the better, but it works and will do for 1.2
473 */
474
475 static struct wait_queue *master_select_wakeup;
476
477 static __inline__ int min(unsigned int a, unsigned int b)
478 {
479 if (a < b)
480 return(a);
481 return(b);
482 }
483
484 #undef STATE_TRACE
485
486 #ifdef STATE_TRACE
487 static char *statename[]={
488 "Unused","Established","Syn Sent","Syn Recv",
489 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
490 "Close Wait","Last ACK","Listen","Closing"
491 };
492 #endif
493
494 static __inline__ void tcp_set_state(struct sock *sk, int state)
495 {
496 if(sk->state==TCP_ESTABLISHED)
497 tcp_statistics.TcpCurrEstab--;
498 #ifdef STATE_TRACE
499 if(sk->debug)
500 printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
501 #endif
502 /* This is a hack but it doesn't occur often and it's going to
503 be a real pain to fix nicely */
504
505 if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
506 {
507 wake_up_interruptible(&master_select_wakeup);
508 }
509 sk->state=state;
510 if(state==TCP_ESTABLISHED)
511 tcp_statistics.TcpCurrEstab++;
512 }
513
514 /*
515 * This routine picks a TCP window for a socket based on
516 * the following constraints
517 *
518 * 1. The window can never be shrunk once it is offered (RFC 793)
519 * 2. We limit memory per socket
520 *
521 * For now we use NET2E3's heuristic of offering half the memory
522 * we have handy. All is not as bad as this seems however because
523 * of two things. Firstly we will bin packets even within the window
524 * in order to get the data we are waiting for into the memory limit.
525 * Secondly we bin common duplicate forms at receive time
526 * Better heuristics welcome
527 */
528
529 int tcp_select_window(struct sock *sk)
530 {
531 int new_window = sk->prot->rspace(sk);
532
533 if(sk->window_clamp)
534 new_window=min(sk->window_clamp,new_window);
535 /*
536 * Two things are going on here. First, we don't ever offer a
537 * window less than min(sk->mss, MAX_WINDOW/2). This is the
538 * receiver side of SWS as specified in RFC1122.
539 * Second, we always give them at least the window they
540 * had before, in order to avoid retracting window. This
541 * is technically allowed, but RFC1122 advises against it and
542 * in practice it causes trouble.
543 *
544 * Fixme: This doesn't correctly handle the case where
545 * new_window > sk->window but not by enough to allow for the
546 * shift in sequence space.
547 */
548 if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
549 return(sk->window);
550 return(new_window);
551 }
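/*
 * The two constraints tcp_select_window() enforces, restated as a
 * standalone sketch (assumed names): never offer less than
 * min(mss, MAX_WINDOW/2), which is receiver-side SWS avoidance, and
 * never offer less than we offered before, which is the RFC 793
 * no-shrink rule.
 */

static unsigned int select_window_sketch(unsigned int free_space,
					 unsigned int clamp,
					 unsigned int mss,
					 unsigned int max_window_half,
					 unsigned int cur_window)
{
	unsigned int new_window = free_space;
	unsigned int floor = mss < max_window_half ? mss : max_window_half;

	if (clamp && clamp < new_window)
		new_window = clamp;
	/* A tiny offer invites silly window syndrome; a smaller offer
	 * than last time would shrink the window. Keep the old one. */
	if (new_window < floor || new_window < cur_window)
		return cur_window;
	return new_window;
}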
552
553 /*
554 * Find someone to 'accept'. Must be called with
555 * sk->inuse=1 or cli()
556 */
557
558 static struct sk_buff *tcp_find_established(struct sock *s)
559 {
560 struct sk_buff *p=skb_peek(&s->receive_queue);
561 if(p==NULL)
562 return NULL;
563 do
564 {
565 if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
566 return p;
567 p=p->next;
568 }
569 while(p!=(struct sk_buff *)&s->receive_queue);
570 return NULL;
571 }
572
573 /*
574 * Remove a completed connection and return it. This is used by
575 * tcp_accept() to get connections from the queue.
576 */
577
578 static struct sk_buff *tcp_dequeue_established(struct sock *s)
579 {
580 struct sk_buff *skb;
581 unsigned long flags;
582 save_flags(flags);
583 cli();
584 skb=tcp_find_established(s);
585 if(skb!=NULL)
586 skb_unlink(skb); /* Take it off the queue */
587 restore_flags(flags);
588 return skb;
589 }
590
591 /*
592 * This routine closes sockets which have been at least partially
593 * opened, but not yet accepted. Currently it is only called by
594 * tcp_close, and timeout mirrors the value there.
595 */
596
597 static void tcp_close_pending (struct sock *sk)
598 {
599 struct sk_buff *skb;
600
601 while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
602 {
603 skb->sk->dead=1;
604 tcp_close(skb->sk, 0);
605 kfree_skb(skb, FREE_READ);
606 }
607 return;
608 }
609
610 /*
611 * Enter the time wait state.
612 */
613
614 static void tcp_time_wait(struct sock *sk)
615 {
616 tcp_set_state(sk,TCP_TIME_WAIT);
617 sk->shutdown = SHUTDOWN_MASK;
618 if (!sk->dead)
619 sk->state_change(sk);
620 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
621 }
622
623 /*
624 * A socket has timed out on its send queue and wants to do a
625 * little retransmitting. Currently this means TCP.
626 */
627
628 void tcp_do_retransmit(struct sock *sk, int all)
629 {
630 struct sk_buff * skb;
631 struct proto *prot;
632 struct device *dev;
633 int ct=0;
634 struct rtable *rt;
635
636 prot = sk->prot;
637 skb = sk->send_head;
638
639 while (skb != NULL)
640 {
641 struct tcphdr *th;
642 struct iphdr *iph;
643 int size;
644
645 dev = skb->dev;
646 IS_SKB(skb);
647 skb->when = jiffies;
648
649 /*
650 * Discard the surplus MAC header
651 */
652
653 skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);
654
655 /*
656 * In general it's OK just to use the old packet. However we
657 * need to use the current ack and window fields. Urg and
658 * urg_ptr could possibly stand to be updated as well, but we
659 * don't keep the necessary data. That shouldn't be a problem,
660 * if the other end is doing the right thing. Since we're
661 * changing the packet, we have to issue a new IP identifier.
662 */
663
664 iph = (struct iphdr *)skb->data;
665 th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
666 size = ntohs(iph->tot_len) - (iph->ihl<<2);
667
668 /*
669 * Note: We ought to check for window limits here but
670 * currently this is done (less efficiently) elsewhere.
671 */
672
673 iph->id = htons(ip_id_count++);
674 ip_send_check(iph);
675
676 /*
677 * Put a MAC header back on (may cause ARPing)
678 */
679
680 if(skb->localroute)
681 rt=ip_rt_local(iph->daddr,NULL,NULL);
682 else
683 rt=ip_rt_route(iph->daddr,NULL,NULL);
684
685 if(rt==NULL) /* Deep poo */
686 {
687 if(skb->sk)
688 {
689 skb->sk->err=ENETUNREACH;
690 skb->sk->error_report(skb->sk);
691 }
692 }
693 else
694 {
695 dev=rt->rt_dev;
696 skb->raddr=rt->rt_gateway;
697 if(skb->raddr==0)
698 skb->raddr=iph->daddr;
699 skb->dev=dev;
700 skb->arp=1;
701 if(dev->hard_header)
702 {
703 if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
704 skb->arp=0;
705 }
706
707 /*
708 * This is not the right way to handle this. We have to
709 * issue an up to date window and ack report with this
710 * retransmit to keep the odd buggy tcp that relies on
711 * the fact BSD does this happy.
712 * We don't however need to recalculate the entire
713 * checksum, so someone wanting a small problem to play
714 * with might like to implement RFC1141/RFC1624 and speed
715 * this up by avoiding a full checksum.
716 */
717
718 th->ack_seq = ntohl(sk->acked_seq);
719 th->window = ntohs(tcp_select_window(sk));
720 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
721
722 /*
723 * If the interface is (still) up and running, kick it.
724 */
725
726 if (dev->flags & IFF_UP)
727 {
728 /*
729 * If the packet is still being sent by the device/protocol
730 * below then don't retransmit. This is both needed, and good -
731 * especially with connected mode AX.25 where it stops resends
732 * occurring of an as yet unsent anyway frame!
733 * We still add up the counts as the round trip time wants
734 * adjusting.
735 */
736 if (sk && !skb_device_locked(skb))
737 {
738 /* Remove it from any existing driver queue first! */
739 skb_unlink(skb);
740 /* Now queue it */
741 ip_statistics.IpOutRequests++;
742 dev_queue_xmit(skb, dev, sk->priority);
743 }
744 }
745 }
746
747 /*
748 * Count retransmissions
749 */
750
751 ct++;
752 sk->prot->retransmits ++;
753 tcp_statistics.TcpRetransSegs++;
754
755
756 /*
757 * Only one retransmit requested.
758 */
759
760 if (!all)
761 break;
762
763 /*
764 * This should cut it off before we send too many packets.
765 */
766
767 if (ct >= sk->cong_window)
768 break;
769 skb = skb->link3;
770 }
771 }
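/*
 * The comment above suggests avoiding the full checksum on retransmit
 * by patching it incrementally. A sketch of the RFC 1624 update
 * (HC' = ~(~HC + ~m + m')) for one changed 16-bit word; the function
 * name is an assumption, no such helper exists here.
 */

static unsigned short csum_incremental_update(unsigned short old_check,
					      unsigned short old_word,
					      unsigned short new_word)
{
	unsigned long sum;

	sum = (unsigned short)~old_check;
	sum += (unsigned short)~old_word;
	sum += new_word;
	/* Fold the carries back into 16 bits (one's complement sum). */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}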
772
773 /*
774 * Reset the retransmission timer
775 */
776
777 static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
778 {
779 del_timer(&sk->retransmit_timer);
780 sk->ip_xmit_timeout = why;
781 if((int)when < 0)
782 {
783 when=3;
784 printk("Error: Negative timer in xmit_timer\n");
785 }
786 sk->retransmit_timer.expires=jiffies+when;
787 add_timer(&sk->retransmit_timer);
788 }
789
790 /*
791 * This is the normal code called for timeouts. It does the retransmission
792 * and then does backoff. tcp_do_retransmit is separated out because
793 * tcp_ack needs to send stuff from the retransmit queue without
794 * initiating a backoff.
795 */
796
797
798 void tcp_retransmit_time(struct sock *sk, int all)
799 {
800 tcp_do_retransmit(sk, all);
801
802 /*
803 * Increase the timeout each time we retransmit. Note that
804 * we do not increase the rtt estimate. rto is initialized
805 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
806 * that doubling rto each time is the least we can get away with.
807 * In KA9Q, Karn uses this for the first few times, and then
808 * goes to quadratic. netBSD doubles, but only goes up to *64,
809 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
810 * defined in the protocol as the maximum possible RTT. I guess
811 * we'll have to use something other than TCP to talk to the
812 * University of Mars.
813 *
814 * PAWS allows us longer timeouts and large windows, so once
815 * implemented ftp to mars will work nicely. We will have to fix
816 * the 120 second clamps though!
817 */
818
819 sk->retransmits++;
820 sk->prot->retransmits++;
821 sk->backoff++;
822 sk->rto = min(sk->rto << 1, 120*HZ);
823 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
824 }
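/*
 * The backoff above in isolation: at least double the timeout per
 * retransmission (Jacobson, SIGCOMM 88), clamped at the 120 second
 * protocol maximum. A sketch with assumed names, mirroring
 * sk->rto = min(sk->rto << 1, 120*HZ). Starting from 3 seconds the
 * series runs 3, 6, 12, 24, 48, 96, 120, 120, ...
 */

static unsigned long backoff_rto(unsigned long rto, unsigned long hz)
{
	unsigned long max_rto = 120 * hz;	/* maximum RTT the protocol allows */

	rto <<= 1;
	return rto < max_rto ? rto : max_rto;
}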
825
826
827 /*
828 * A timer event has triggered a TCP retransmit timeout. The
829 * socket xmit queue is ready and set up to send. Because
830 * the ack receive code keeps the queue straight we do
831 * nothing clever here.
832 */
833
834 static void tcp_retransmit(struct sock *sk, int all)
835 {
836 if (all)
837 {
838 tcp_retransmit_time(sk, all);
839 return;
840 }
841
842 sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
843 /* sk->ssthresh in theory can be zero. I guess that's OK */
844 sk->cong_count = 0;
845
846 sk->cong_window = 1;
847
848 /* Do the actual retransmit. */
849 tcp_retransmit_time(sk, all);
850 }
851
852 /*
853 * A write timeout has occurred. Process the after effects.
854 */
855
856 static int tcp_write_timeout(struct sock *sk)
857 {
858 /*
859 * Look for a 'soft' timeout.
860 */
861 if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
862 || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
863 {
864 /*
865 * Attempt to recover if arp has changed (unlikely!) or
866 * a route has shifted (not supported prior to 1.3).
867 */
868 arp_destroy (sk->daddr, 0);
869 /*ip_route_check (sk->daddr);*/
870 }
871
872 /*
873 * Have we tried to SYN too many times (repent repent 8))
874 */
875
876 if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
877 {
878 sk->err=ETIMEDOUT;
879 sk->error_report(sk);
880 del_timer(&sk->retransmit_timer);
881 tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */
882 tcp_set_state(sk,TCP_CLOSE);
883 /* Don't FIN, we got nothing back */
884 release_sock(sk);
885 return 0;
886 }
887 /*
888 * Has it gone just too far ?
889 */
890 if (sk->retransmits > TCP_RETR2)
891 {
892 sk->err = ETIMEDOUT;
893 sk->error_report(sk);
894 del_timer(&sk->retransmit_timer);
895 /*
896 * Time wait the socket
897 */
898 if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
899 {
900 tcp_set_state(sk,TCP_TIME_WAIT);
901 reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
902 }
903 else
904 {
905 /*
906 * Clean up time.
907 */
908 tcp_set_state(sk, TCP_CLOSE);
909 release_sock(sk);
910 return 0;
911 }
912 }
913 return 1;
914 }
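/*
 * The escalation policy of tcp_write_timeout(), distilled into a
 * sketch (assumed names and constants; the real routine checks the
 * soft case first and can act on both in one call): a soft timeout
 * re-checks arp/routing, SYNs give up earliest, and everything gives
 * up at the hard retry limit.
 */

enum timeout_action { TO_KEEP_TRYING, TO_RECHECK_ROUTE, TO_GIVE_UP };

static enum timeout_action write_timeout_sketch(int established, int syn_sent,
						int retransmits, int retr1,
						int retr2, int syn_retries)
{
	if (syn_sent && retransmits > syn_retries)
		return TO_GIVE_UP;		/* never got a SYN-ACK back */
	if (retransmits > retr2)
		return TO_GIVE_UP;		/* hard limit for every state */
	if ((established && retransmits && !(retransmits & 7)) ||
	    (!established && retransmits > retr1))
		return TO_RECHECK_ROUTE;	/* soft timeout: path may have moved */
	return TO_KEEP_TRYING;
}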
915
916 /*
917 * The TCP retransmit timer. This lacks a few small details.
918 *
919 * 1. An initial rtt timeout on the probe0 should cause as much as
920 * we can of the first write queue buffer to be split and sent.
921 * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
922 * ETIMEDOUT if we know an additional 'soft' error caused this.
923 * tcp_err should save a 'soft error' for us.
924 */
925
926 static void retransmit_timer(unsigned long data)
927 {
928 struct sock *sk = (struct sock*)data;
929 int why = sk->ip_xmit_timeout;
930
931 /*
932 * only process if socket is not in use
933 */
934
935 cli();
936 if (sk->inuse || in_bh)
937 {
938 /* Try again in 1 second */
939 sk->retransmit_timer.expires = jiffies+HZ;
940 add_timer(&sk->retransmit_timer);
941 sti();
942 return;
943 }
944
945 sk->inuse = 1;
946 sti();
947
948 /* Always see if we need to send an ack. */
949
950 if (sk->ack_backlog && !sk->zapped)
951 {
952 sk->prot->read_wakeup (sk);
953 if (! sk->dead)
954 sk->data_ready(sk,0);
955 }
956
957 /* Now we need to figure out why the socket was on the timer. */
958
959 switch (why)
960 {
961 /* Window probing */
962 case TIME_PROBE0:
963 tcp_send_probe0(sk);
964 tcp_write_timeout(sk);
965 break;
966 /* Retransmitting */
967 case TIME_WRITE:
968 /* It could be we got here because we needed to send an ack.
969 * So we need to check for that.
970 */
971 {
972 struct sk_buff *skb;
973 unsigned long flags;
974
975 save_flags(flags);
976 cli();
977 skb = sk->send_head;
978 if (!skb)
979 {
980 restore_flags(flags);
981 }
982 else
983 {
984 /*
985 * Kicked by a delayed ack. Reset timer
986 * correctly now
987 */
988 if (jiffies < skb->when + sk->rto)
989 {
990 reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
991 restore_flags(flags);
992 break;
993 }
994 restore_flags(flags);
995 /*
996 * Retransmission
997 */
998 sk->retransmits++;
999 sk->prot->retransmits++;
1000 sk->prot->retransmit (sk, 0);
1001 tcp_write_timeout(sk);
1002 }
1003 break;
1004 }
1005 /* Sending Keepalives */
1006 case TIME_KEEPOPEN:
1007 /*
1008 * this reset_timer() call is a hack, this is not
1009 * how KEEPOPEN is supposed to work.
1010 */
1011 reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
1012
1013 /* Send something to keep the connection open. */
1014 if (sk->prot->write_wakeup)
1015 sk->prot->write_wakeup (sk);
1016 sk->retransmits++;
1017 sk->prot->retransmits++;
1018 tcp_write_timeout(sk);
1019 break;
1020 default:
1021 printk ("rexmit_timer: timer expired - reason unknown\n");
1022 break;
1023 }
1024 release_sock(sk);
1025 }
1026
1027 /*
1028 * This routine is called by the ICMP module when it gets some
1029 * sort of error condition. If err < 0 then the socket should
1030 * be closed and the error returned to the user. If err > 0
1031 * it's just the icmp type << 8 | icmp code. After adjustment
1032 * header points to the first 8 bytes of the tcp header. We need
1033 * to find the appropriate port.
1034 */
1035
1036 void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
1037 __u32 saddr, struct inet_protocol *protocol)
1038 {
1039 struct tcphdr *th;
1040 struct sock *sk;
1041 struct iphdr *iph=(struct iphdr *)header;
1042
1043 header+=4*iph->ihl;
1044
1045
1046 th =(struct tcphdr *)header;
1047 sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);
1048
1049 if (sk == NULL)
1050 return;
1051
1052 if (type == ICMP_SOURCE_QUENCH)
1053 {
1054 /*
1055 * FIXME:
1056 * For now we will just trigger a linear backoff.
1057 * The slow start code should cause a real backoff here.
1058 */
1059 if (sk->cong_window > 4)
1060 sk->cong_window--;
1061 return;
1062 }
1063
1064 if (type == ICMP_PARAMETERPROB)
1065 {
1066 sk->err=EPROTO;
1067 sk->error_report(sk);
1068 }
1069
1070 /*
1071 * If we've already connected we will keep trying
1072 * until we time out, or the user gives up.
1073 */
1074
1075 if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
1076 {
1077 sk->err = icmp_err_convert[code].errno;
1078 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1079 {
1080 tcp_statistics.TcpAttemptFails++;
1081 tcp_set_state(sk,TCP_CLOSE);
1082 sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
1083 }
1084 }
1085 return;
1086 }
1087
1088
1089 /*
1090 * Walk down the receive queue counting readable data until we hit the end or we find a gap
1091 * in the received data queue (i.e. a missing frame that has yet to be sent to us). Not
1092 * sorting into two queues as data arrives makes life so much harder.
1093 */
1094
1095 static int tcp_readable(struct sock *sk)
1096 {
1097 unsigned long counted;
1098 unsigned long amount;
1099 struct sk_buff *skb;
1100 int sum;
1101 unsigned long flags;
1102
1103 if(sk && sk->debug)
1104 printk("tcp_readable: %p - ",sk);
1105
1106 save_flags(flags);
1107 cli();
1108 if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
1109 {
1110 restore_flags(flags);
1111 if(sk && sk->debug)
1112 printk("empty\n");
1113 return(0);
1114 }
1115
1116 counted = sk->copied_seq; /* Where we are at the moment */
1117 amount = 0;
1118
1119 /*
1120 * Do until a push or until we are out of data.
1121 */
1122
1123 do
1124 {
1125 if (before(counted, skb->h.th->seq)) /* Found a hole so stop here */
1126 break;
1127 sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */
1128 if (skb->h.th->syn)
1129 sum++;
1130 if (sum > 0)
1131 { /* Add it up, move on */
1132 amount += sum;
1133 if (skb->h.th->syn)
1134 amount--;
1135 counted += sum;
1136 }
1137 /*
1138 * Don't count urg data ... but do it in the right place!
1139 * Consider: "old_data (ptr is here) URG PUSH data"
1140 * The old code would stop at the first push because
1141 * it counted the urg (amount==1) and then does amount--
1142 * *after* the loop. This means tcp_readable() always
1143 * returned zero if any URG PUSH was in the queue, even
1144 * though there was normal data available. If we subtract
1145 * the urg data right here, we even get it to work for more
1146 * than one URG PUSH skb without normal data.
1147 * This means that select() finally works now with urg data
1148 * in the queue. Note that rlogin was never affected
1149 * because it doesn't use select(); it uses two processes
1150 * and a blocking read(). And the queue scan in tcp_read()
1151 * was correct. Mike <pall@rz.uni-karlsruhe.de>
1152 */
1153 if (skb->h.th->urg)
1154 amount--; /* don't count urg data */
1155 if (amount && skb->h.th->psh) break;
1156 skb = skb->next;
1157 }
1158 while(skb != (struct sk_buff *)&sk->receive_queue);
1159
1160 restore_flags(flags);
1161 if(sk->debug)
1162 printk("got %lu bytes.\n",amount);
1163 return(amount);
1164 }
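/*
 * The counting rule from the loop above, reduced to one segment and
 * ignoring the SYN bookkeeping (sketch, assumed names): given how far
 * we have copied, how many new readable bytes does a segment add? The
 * urgent byte is subtracted here, per segment, which is exactly the
 * Mike Pall fix described in the comment.
 */

static int readable_in_skb(unsigned long copied_seq, unsigned long seq,
			   int len, int urg)
{
	int sum = len - (int)(copied_seq - seq);	/* skip bytes already copied */

	if (sum <= 0)
		return 0;	/* segment fully consumed or overlapping */
	if (urg)
		sum--;		/* urgent data doesn't count as readable */
	return sum;
}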
1165
1166 /*
1167 * LISTEN is a special case for select..
1168 */
1169 static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
1170 {
1171 if (sel_type == SEL_IN) {
1172 int retval;
1173
1174 sk->inuse = 1;
1175 retval = (tcp_find_established(sk) != NULL);
1176 release_sock(sk);
1177 if (!retval)
1178 select_wait(&master_select_wakeup,wait);
1179 return retval;
1180 }
1181 return 0;
1182 }
1183
1184
1185 /*
1186 * Wait for a TCP event.
1187 *
1188 * Note that we don't need to set "sk->inuse", as the upper select layers
1189 * take care of normal races (between the test and the event) and we don't
1190 * go look at any of the socket buffers directly.
1191 */
1192 static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
1193 {
1194 if (sk->state == TCP_LISTEN)
1195 return tcp_listen_select(sk, sel_type, wait);
1196
1197 switch(sel_type) {
1198 case SEL_IN:
1199 if (sk->err)
1200 return 1;
1201 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1202 break;
1203
1204 if (sk->shutdown & RCV_SHUTDOWN)
1205 return 1;
1206
1207 if (sk->acked_seq == sk->copied_seq)
1208 break;
1209
1210 if (sk->urg_seq != sk->copied_seq ||
1211 sk->acked_seq != sk->copied_seq+1 ||
1212 sk->urginline || !sk->urg_data)
1213 return 1;
1214 break;
1215
1216 case SEL_OUT:
1217 if (sk->err)
1218 return 1;
1219 if (sk->shutdown & SEND_SHUTDOWN)
1220 return 0;
1221 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
1222 break;
1223 /*
1224 * This is now right thanks to a small fix
1225 * by Matt Dillon.
1226 */
1227
1228 if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header)
1229 break;
1230 return 1;
1231
1232 case SEL_EX:
1233 if (sk->urg_data)
1234 return 1;
1235 break;
1236 }
1237 select_wait(sk->sleep, wait);
1238 return 0;
1239 }
1240
1241 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
1242 {
1243 int err;
1244 switch(cmd)
1245 {
1246
1247 case TIOCINQ:
1248 #ifdef FIXME /* FIXME: */
1249 case FIONREAD:
1250 #endif
1251 {
1252 unsigned long amount;
1253
1254 if (sk->state == TCP_LISTEN)
1255 return(-EINVAL);
1256
1257 sk->inuse = 1;
1258 amount = tcp_readable(sk);
1259 release_sock(sk);
1260 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1261 if(err)
1262 return err;
1263 put_user(amount, (int *)arg);
1264 return(0);
1265 }
1266 case SIOCATMARK:
1267 {
1268 int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
1269
1270 err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
1271 if (err)
1272 return err;
1273 put_user(answ,(int *) arg);
1274 return(0);
1275 }
1276 case TIOCOUTQ:
1277 {
1278 unsigned long amount;
1279
1280 if (sk->state == TCP_LISTEN) return(-EINVAL);
1281 amount = sk->prot->wspace(sk);
1282 err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
1283 if(err)
1284 return err;
1285 put_user(amount, (int *)arg);
1286 return(0);
1287 }
1288 default:
1289 return(-EINVAL);
1290 }
1291 }
1292
1293
1294 /*
1295 * This routine computes a TCP checksum.
1296 *
1297 * Modified January 1995 from a go-faster DOS routine by
1298 * Jorge Cwik <jorge@laser.satlink.net>
1299 */
1300
1301 unsigned short tcp_check(struct tcphdr *th, int len,
1302 unsigned long saddr, unsigned long daddr, unsigned long base)
1303 {
1304 return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
1305 }
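/*
 * What the csum_partial()/csum_tcpudp_magic() pair computes, written
 * out as a portable sketch: the 16-bit one's complement sum over the
 * TCP pseudo header (addresses, protocol, length) and the segment
 * itself. Names are assumptions and addresses are taken as plain
 * 32-bit values; the real helpers are hand-tuned per architecture.
 */

static unsigned short tcp_check_sketch(const unsigned char *seg, int len,
				       unsigned long saddr, unsigned long daddr)
{
	unsigned long sum = 0;
	int i;

	/* Pseudo header: source, destination, zero/protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += len;

	/* The segment, 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += (seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += seg[len - 1] << 8;	/* pad the odd trailing byte */

	/* Fold carries and complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}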
1306
1307
1308
1309 void tcp_send_check(struct tcphdr *th, unsigned long saddr,
1310 unsigned long daddr, int len, struct sock *sk)
1311 {
1312 th->check = 0;
1313 th->check = tcp_check(th, len, saddr, daddr,
1314 csum_partial((char *)th,len,0));
1315 return;
1316 }
1317
1318 /*
1319 * This is the main buffer sending routine. We queue the buffer
1320 * having checked it is sane seeming.
1321 */
1322
1323 static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
1324 {
1325 int size;
1326 struct tcphdr * th = skb->h.th;
1327
1328 /*
1329 * length of packet (not counting length of pre-tcp headers)
1330 */
1331
1332 size = skb->len - ((unsigned char *) th - skb->data);
1333
1334 /*
1335 * Sanity check it..
1336 */
1337
1338 if (size < sizeof(struct tcphdr) || size > skb->len)
1339 {
1340 printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
1341 skb, skb->data, th, skb->len);
1342 kfree_skb(skb, FREE_WRITE);
1343 return;
1344 }
1345
1346 /*
1347 * If we have queued a header size packet.. (these crash a few
1348 * tcp stacks if ack is not set)
1349 */
1350
1351 if (size == sizeof(struct tcphdr))
1352 {
1353 /* If it's got a syn or fin it's notionally included in the size..*/
1354 if(!th->syn && !th->fin)
1355 {
1356 printk("tcp_send_skb: attempt to queue a bogon.\n");
1357 kfree_skb(skb,FREE_WRITE);
1358 return;
1359 }
1360 }
1361
1362 /*
1363 * Actual processing.
1364 */
1365
1366 tcp_statistics.TcpOutSegs++;
1367 skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
1368
1369 /*
1370 * We must queue if
1371 *
1372 * a) The right edge of this frame exceeds the window
1373 * b) We are retransmitting (Nagle's rule)
1374 * c) We have too many packets 'in flight'
1375 */
1376
1377 if (after(skb->h.seq, sk->window_seq) ||
1378 (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
1379 sk->packets_out >= sk->cong_window)
1380 {
1381 /* checksum will be supplied by tcp_write_xmit. So
1382 * we shouldn't need to set it at all. I'm being paranoid */
1383 th->check = 0;
1384 if (skb->next != NULL)
1385 {
1386 printk("tcp_send_partial: next != NULL\n");
1387 skb_unlink(skb);
1388 }
1389 skb_queue_tail(&sk->write_queue, skb);
1390
1391 /*
1392 * If we don't fit we have to start the zero window
1393 * probes. This is broken - we really need to do a partial
1394 * send _first_ (This is what causes the Cisco and PC/TCP
1395 * grief).
1396 */
1397
1398 if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
1399 sk->send_head == NULL && sk->ack_backlog == 0)
1400 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
1401 }
1402 else
1403 {
1404 /*
1405 * This is going straight out
1406 */
1407
1408 th->ack_seq = ntohl(sk->acked_seq);
1409 th->window = ntohs(tcp_select_window(sk));
1410
1411 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
1412
1413 sk->sent_seq = sk->write_seq;
1414
1415 /*
1416 * This is mad. The tcp retransmit queue is put together
1417 * by the ip layer. This causes half the problems with
1418 * unroutable FIN's and other things.
1419 */
1420
1421 sk->prot->queue_xmit(sk, skb->dev, skb, 0);
1422
1423 /*
1424 * Set for next retransmit based on expected ACK time.
1425 * FIXME: We set this every time which means our
1426 * retransmits are really about a window behind.
1427 */
1428
1429 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
1430 }
1431 }
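/*
 * The queue-or-send decision above, distilled into a predicate
 * (sketch, assumed names). We must queue rather than transmit when
 * (a) the frame's right edge is beyond the offered window, (b) we are
 * already retransmitting (Nagle's rule), or (c) the congestion window
 * is full.
 */

static int tcp_must_queue(unsigned long end_seq, unsigned long window_seq,
			  int retransmitting, unsigned long packets_out,
			  unsigned long cong_window)
{
	if ((long)(end_seq - window_seq) > 0)	/* (a) after(end_seq, window_seq) */
		return 1;
	if (retransmitting)			/* (b) no new data while recovering */
		return 1;
	if (packets_out >= cong_window)		/* (c) too many frames in flight */
		return 1;
	return 0;
}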
1432
1433 /*
1434 * Locking problems lead us to a messy situation where we can have
1435 * multiple partially complete buffers queued up. This is really bad
1436 * as we don't want to be sending partial buffers. Fix this with
1437 * a semaphore or similar to lock tcp_write per socket.
1438 *
1439 * These routines are pretty self descriptive.
1440 */
1441
1442 struct sk_buff * tcp_dequeue_partial(struct sock * sk)
1443 {
1444 struct sk_buff * skb;
1445 unsigned long flags;
1446
1447 save_flags(flags);
1448 cli();
1449 skb = sk->partial;
1450 if (skb) {
1451 sk->partial = NULL;
1452 del_timer(&sk->partial_timer);
1453 }
1454 restore_flags(flags);
1455 return skb;
1456 }
1457
1458 /*
1459 * Empty the partial queue
1460 */
1461
1462 static void tcp_send_partial(struct sock *sk)
1463 {
1464 struct sk_buff *skb;
1465
1466 if (sk == NULL)
1467 return;
1468 while ((skb = tcp_dequeue_partial(sk)) != NULL)
1469 tcp_send_skb(sk, skb);
1470 }
1471
1472 /*
1473 * Queue a partial frame
1474 */
1475
1476 void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
1477 {
1478 struct sk_buff * tmp;
1479 unsigned long flags;
1480
1481 save_flags(flags);
1482 cli();
1483 tmp = sk->partial;
1484 if (tmp)
1485 del_timer(&sk->partial_timer);
1486 sk->partial = skb;
1487 init_timer(&sk->partial_timer);
1488 /*
1489 * Wait up to 1 second for the buffer to fill.
1490 */
1491 sk->partial_timer.expires = jiffies+HZ;
1492 sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
1493 sk->partial_timer.data = (unsigned long) sk;
1494 add_timer(&sk->partial_timer);
1495 restore_flags(flags);
1496 if (tmp)
1497 tcp_send_skb(sk, tmp);
1498 }
1499
1500
1501 /*
1502 * This routine sends an ack and also updates the window.
1503 */
1504
1505 static void tcp_send_ack(u32 sequence, u32 ack,
1506 struct sock *sk,
1507 struct tcphdr *th, unsigned long daddr)
1508 {
1509 struct sk_buff *buff;
1510 struct tcphdr *t1;
1511 struct device *dev = NULL;
1512 int tmp;
1513
1514 if(sk->zapped)
1515 return; /* We have been reset, we may not send again */
1516
1517 /*
1518 * We need to grab some memory, and put together an ack,
1519 * and then put it into the queue to be sent.
1520 */
1521
1522 buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1523 if (buff == NULL)
1524 {
1525 /*
1526 * Force it to send an ack. We don't have to do this
1527 * (ACK is unreliable) but it's much better use of
1528 * bandwidth on slow links to send a spare ack than
1529 * resend packets.
1530 */
1531
1532 sk->ack_backlog++;
1533 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1534 {
1535 reset_xmit_timer(sk, TIME_WRITE, HZ);
1536 }
1537 return;
1538 }
1539
1540 /*
1541 * Assemble a suitable TCP frame
1542 */
1543
1544 buff->sk = sk;
1545 buff->localroute = sk->localroute;
1546
1547 /*
1548 * Put in the IP header and routing stuff.
1549 */
1550
1551 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1552 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
1553 if (tmp < 0)
1554 {
1555 buff->free = 1;
1556 sk->prot->wfree(sk, buff);
1557 return;
1558 }
1559 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1560
1561 memcpy(t1, th, sizeof(*t1));
1562
1563 /*
1564 * Swap the send and the receive.
1565 */
1566
1567 t1->dest = th->source;
1568 t1->source = th->dest;
1569 t1->seq = ntohl(sequence);
1570 t1->ack = 1;
1571 sk->window = tcp_select_window(sk);
1572 t1->window = ntohs(sk->window);
1573 t1->res1 = 0;
1574 t1->res2 = 0;
1575 t1->rst = 0;
1576 t1->urg = 0;
1577 t1->syn = 0;
1578 t1->psh = 0;
1579 t1->fin = 0;
1580
1581 /*
1582 * If we have nothing queued for transmit and the transmit timer
1583 * is on we are just doing an ACK timeout and need to switch
1584 * to a keepalive.
1585 */
1586
1587 if (ack == sk->acked_seq)
1588 {
1589 sk->ack_backlog = 0;
1590 sk->bytes_rcv = 0;
1591 sk->ack_timed = 0;
1592 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1593 && sk->ip_xmit_timeout == TIME_WRITE)
1594 {
1595 if(sk->keepopen) {
1596 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1597 } else {
1598 delete_timer(sk);
1599 }
1600 }
1601 }
1602
1603 /*
1604 * Fill in the packet and send it
1605 */
1606
1607 t1->ack_seq = ntohl(ack);
1608 t1->doff = sizeof(*t1)/4;
1609 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1610 if (sk->debug)
1611 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1612 tcp_statistics.TcpOutSegs++;
1613 sk->prot->queue_xmit(sk, dev, buff, 1);
1614 }
1615
1616
1617 /*
1618 * This routine builds a generic TCP header.
1619 */
1620
1621 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1622 {
1623
1624 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1625 th->seq = htonl(sk->write_seq);
1626 th->psh =(push == 0) ? 1 : 0;
1627 th->doff = sizeof(*th)/4;
1628 th->ack = 1;
1629 th->fin = 0;
1630 sk->ack_backlog = 0;
1631 sk->bytes_rcv = 0;
1632 sk->ack_timed = 0;
1633 th->ack_seq = htonl(sk->acked_seq);
1634 sk->window = tcp_select_window(sk);
1635 th->window = htons(sk->window);
1636
1637 return(sizeof(*th));
1638 }
1639
1640 /*
1641 * This routine copies from a user buffer into a socket,
1642 * and starts the transmit system.
1643 */
1644
1645 static int tcp_write(struct sock *sk, const unsigned char *from,
1646 int len, int nonblock, unsigned flags)
1647 {
1648 int copied = 0;
1649 int copy;
1650 int tmp;
1651 struct sk_buff *skb;
1652 struct sk_buff *send_tmp;
1653 struct proto *prot;
1654 struct device *dev = NULL;
1655
1656 sk->inuse=1;
1657 prot = sk->prot;
1658 while(len > 0)
1659 {
1660 if (sk->err)
1661 { /* Stop on an error */
1662 release_sock(sk);
1663 if (copied)
1664 return(copied);
1665 tmp = -sk->err;
1666 sk->err = 0;
1667 return(tmp);
1668 }
1669
1670 /*
1671 * First thing we do is make sure that we are established.
1672 */
1673
1674 if (sk->shutdown & SEND_SHUTDOWN)
1675 {
1676 release_sock(sk);
1677 sk->err = EPIPE;
1678 if (copied)
1679 return(copied);
1680 sk->err = 0;
1681 return(-EPIPE);
1682 }
1683
1684 /*
1685 * Wait for a connection to finish.
1686 */
1687
1688 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1689 {
1690 if (sk->err)
1691 {
1692 release_sock(sk);
1693 if (copied)
1694 return(copied);
1695 tmp = -sk->err;
1696 sk->err = 0;
1697 return(tmp);
1698 }
1699
1700 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1701 {
1702 release_sock(sk);
1703 if (copied)
1704 return(copied);
1705
1706 if (sk->err)
1707 {
1708 tmp = -sk->err;
1709 sk->err = 0;
1710 return(tmp);
1711 }
1712
1713 if (sk->keepopen)
1714 {
1715 send_sig(SIGPIPE, current, 0);
1716 }
1717 return(-EPIPE);
1718 }
1719
1720 if (nonblock || copied)
1721 {
1722 release_sock(sk);
1723 if (copied)
1724 return(copied);
1725 return(-EAGAIN);
1726 }
1727
1728 release_sock(sk);
1729 cli();
1730
1731 if (sk->state != TCP_ESTABLISHED &&
1732 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1733 {
1734 interruptible_sleep_on(sk->sleep);
1735 if (current->signal & ~current->blocked)
1736 {
1737 sti();
1738 if (copied)
1739 return(copied);
1740 return(-ERESTARTSYS);
1741 }
1742 }
1743 sk->inuse = 1;
1744 sti();
1745 }
1746
1747 /*
1748 * The following code can result in copy <= 0 if sk->mss is ever
1749 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1750 * sk->mtu is constant once SYN processing is finished. I.e. we
1751 * had better not get here until we've seen his SYN and at least one
1752 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1753 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1754 * non-decreasing. Note that any ioctl to set user_mss must be done
1755 * before the exchange of SYN's. If the initial ack from the other
1756 * end has a window of 0, max_window and thus mss will both be 0.
1757 */
1758
1759 /*
1760 * Now we need to check if we have a half built packet.
1761 */
1762
1763 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1764 {
1765 int hdrlen;
1766
1767 /* IP header + TCP header */
1768 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1769 + sizeof(struct tcphdr);
1770
1771 /* Add more stuff to the end of skb->len */
1772 if (!(flags & MSG_OOB))
1773 {
1774 copy = min(sk->mss - (skb->len - hdrlen), len);
1775 /* FIXME: this is really a bug. */
1776 if (copy <= 0)
1777 {
1778 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1779 copy = 0;
1780 }
1781
1782 memcpy_fromfs(skb_put(skb,copy), from, copy);
1783 from += copy;
1784 copied += copy;
1785 len -= copy;
1786 sk->write_seq += copy;
1787 }
1788 if ((skb->len - hdrlen) >= sk->mss ||
1789 (flags & MSG_OOB) || !sk->packets_out)
1790 tcp_send_skb(sk, skb);
1791 else
1792 tcp_enqueue_partial(skb, sk);
1793 continue;
1794 }
1795
1796 /*
1797 * We also need to worry about the window.
1798 * If window < 1/2 the maximum window we've seen from this
1799 * host, don't use it. This is sender side
1800 * silly window prevention, as specified in RFC1122.
1801 * (Note that this is different from earlier versions of
1802 * SWS prevention, e.g. RFC 813.) What we actually do is
1803 * use the whole MSS. Since this results in the right
1804 * edge of the packet being outside the window, it will
1805 * be queued for later rather than sent.
1806 */
1807
1808 copy = sk->window_seq - sk->write_seq;
1809 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1810 copy = sk->mss;
1811 if (copy > len)
1812 copy = len;
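/*
 * Editorial sketch, with assumed figures: suppose mss = 1460 and
 * max_window = 8192. If window_seq - write_seq = 700, then
 * 700 < (8192 >> 1), so copy is bumped up to the full mss of 1460.
 * The segment's right edge now lies beyond the offered window, so
 * tcp_write_xmit() will keep it on the write queue rather than
 * dribble out a silly 700 byte fragment.
 */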
1813
1814 /*
1815 * We should really check the window here also.
1816 */
1817
1818 send_tmp = NULL;
1819 if (copy < sk->mss && !(flags & MSG_OOB))
1820 {
1821 /*
1822 * We will release the socket in case we sleep here.
1823 */
1824 release_sock(sk);
1825 /*
1826 * NB: following must be mtu, because mss can be increased.
1827 * mss is always <= mtu
1828 */
1829 skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1830 sk->inuse = 1;
1831 send_tmp = skb;
1832 }
1833 else
1834 {
1835 /*
1836 * We will release the socket in case we sleep here.
1837 */
1838 release_sock(sk);
1839 skb = prot->wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1840 sk->inuse = 1;
1841 }
1842
1843 /*
1844 * If we didn't get any memory, we need to sleep.
1845 */
1846
1847 if (skb == NULL)
1848 {
1849 sk->socket->flags |= SO_NOSPACE;
1850 if (nonblock)
1851 {
1852 release_sock(sk);
1853 if (copied)
1854 return(copied);
1855 return(-EAGAIN);
1856 }
1857
1858 /*
1859 * FIXME: here is another race condition.
1860 */
1861
1862 tmp = sk->wmem_alloc;
1863 release_sock(sk);
1864 cli();
1865 /*
1866 * Again we will try to avoid it.
1867 */
1868 if (tmp <= sk->wmem_alloc &&
1869 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1870 && sk->err == 0)
1871 {
1872 sk->socket->flags &= ~SO_NOSPACE;
1873 interruptible_sleep_on(sk->sleep);
1874 if (current->signal & ~current->blocked)
1875 {
1876 sti();
1877 if (copied)
1878 return(copied);
1879 return(-ERESTARTSYS);
1880 }
1881 }
1882 sk->inuse = 1;
1883 sti();
1884 continue;
1885 }
1886
1887 skb->sk = sk;
1888 skb->free = 0;
1889 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1890
1891 /*
1892 * FIXME: we need to optimize this.
1893 * Perhaps some hints here would be good.
1894 */
1895
1896 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1897 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1898 if (tmp < 0 )
1899 {
1900 prot->wfree(sk, skb);
1901 release_sock(sk);
1902 if (copied)
1903 return(copied);
1904 return(tmp);
1905 }
1906 skb->dev = dev;
1907 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1908 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1909 if (tmp < 0)
1910 {
1911 prot->wfree(sk, skb);
1912 release_sock(sk);
1913 if (copied)
1914 return(copied);
1915 return(tmp);
1916 }
1917
1918 if (flags & MSG_OOB)
1919 {
1920 skb->h.th->urg = 1;
1921 skb->h.th->urg_ptr = ntohs(copy);
1922 }
1923
1924 memcpy_fromfs(skb_put(skb,copy), from, copy);
1925
1926 from += copy;
1927 copied += copy;
1928 len -= copy;
1929 skb->free = 0;
1930 sk->write_seq += copy;
1931
1932 if (send_tmp != NULL && sk->packets_out)
1933 {
1934 tcp_enqueue_partial(send_tmp, sk);
1935 continue;
1936 }
1937 tcp_send_skb(sk, skb);
1938 }
1939 sk->err = 0;
1940
1941 /*
1942 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1943 * interactive fast network servers. It's meant to be on and
1944 * it really improves the throughput though not the echo time
1945 * on my slow slip link - Alan
1946 */
1947
1948 /*
1949 * Avoid possible race on send_tmp - c/o Johannes Stille
1950 */
1951
1952 if(sk->partial && ((!sk->packets_out)
1953 /* If not nagling we can send on the before case too.. */
1954 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1955 ))
1956 tcp_send_partial(sk);
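/*
 * Editorial note on the test above: with Nagle in force
 * (sk->nonagle == 0) a partial packet is flushed here only once
 * packets_out reaches zero, i.e. everything previously sent has been
 * acked. With TCP_NODELAY the before(write_seq, window_seq) clause
 * also lets it go as soon as the window has room, trading extra
 * small segments for lower latency.
 */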
1957
1958 release_sock(sk);
1959 return(copied);
1960 }
1961
1962 /*
1963 * This is just a wrapper.
1964 */
1965
1966 static int tcp_sendto(struct sock *sk, const unsigned char *from,
1967 int len, int nonblock, unsigned flags,
1968 struct sockaddr_in *addr, int addr_len)
1969 {
1970 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1971 return -EINVAL;
1972 if (sk->state == TCP_CLOSE)
1973 return -ENOTCONN;
1974 if (addr_len < sizeof(*addr))
1975 return -EINVAL;
1976 if (addr->sin_family && addr->sin_family != AF_INET)
1977 return -EINVAL;
1978 if (addr->sin_port != sk->dummy_th.dest)
1979 return -EISCONN;
1980 if (addr->sin_addr.s_addr != sk->daddr)
1981 return -EISCONN;
1982 return tcp_write(sk, from, len, nonblock, flags);
1983 }
1984
1985
1986 /*
1987 * Send an ack if one is backlogged at this point. Ought to merge
1988 * this with tcp_send_ack().
1989 */
1990
1991 static void tcp_read_wakeup(struct sock *sk)
1992 {
1993 int tmp;
1994 struct device *dev = NULL;
1995 struct tcphdr *t1;
1996 struct sk_buff *buff;
1997
1998 if (!sk->ack_backlog)
1999 return;
2000
2001 /*
2002 * If we're closed, don't send an ack, or we'll get a RST
2003 * from the closed destination.
2004 */
2005 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2006 return;
2007
2008 /*
2009 * FIXME: we need to put code here to prevent this routine from
2010 * being called. Being called once in a while is ok, so only check
2011 * if this is the second time in a row.
2012 */
2013
2014 /*
2015 * We need to grab some memory, and put together an ack,
2016 * and then put it into the queue to be sent.
2017 */
2018
2019 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2020 if (buff == NULL)
2021 {
2022 /* Try again real soon. */
2023 reset_xmit_timer(sk, TIME_WRITE, HZ);
2024 return;
2025 }
2026
2027 buff->sk = sk;
2028 buff->localroute = sk->localroute;
2029
2030 /*
2031 * Put in the IP header and routing stuff.
2032 */
2033
2034 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2035 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2036 if (tmp < 0)
2037 {
2038 buff->free = 1;
2039 sk->prot->wfree(sk, buff);
2040 return;
2041 }
2042
2043 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2044
2045 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2046 t1->seq = htonl(sk->sent_seq);
2047 t1->ack = 1;
2048 t1->res1 = 0;
2049 t1->res2 = 0;
2050 t1->rst = 0;
2051 t1->urg = 0;
2052 t1->syn = 0;
2053 t1->psh = 0;
2054 sk->ack_backlog = 0;
2055 sk->bytes_rcv = 0;
2056 sk->window = tcp_select_window(sk);
2057 t1->window = ntohs(sk->window);
2058 t1->ack_seq = ntohl(sk->acked_seq);
2059 t1->doff = sizeof(*t1)/4;
2060 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2061 sk->prot->queue_xmit(sk, dev, buff, 1);
2062 tcp_statistics.TcpOutSegs++;
2063 }
2064
2065
2066 /*
2067 * FIXME:
2068 * This routine frees used buffers.
2069 * It should consider sending an ACK to let the
2070 * other end know we now have a bigger window.
2071 */
2072
2073 static void cleanup_rbuf(struct sock *sk)
2074 {
2075 unsigned long flags;
2076 unsigned long left;
2077 struct sk_buff *skb;
2078 unsigned long rspace;
2079
2080 if(sk->debug)
2081 printk("cleaning rbuf for sk=%p\n", sk);
2082
2083 save_flags(flags);
2084 cli();
2085
2086 left = sk->prot->rspace(sk);
2087
2088 /*
2089 * We have to loop through all the buffer headers,
2090 * and try to free up all the space we can.
2091 */
2092
2093 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2094 {
2095 if (!skb->used || skb->users)
2096 break;
2097 skb_unlink(skb);
2098 skb->sk = sk;
2099 kfree_skb(skb, FREE_READ);
2100 }
2101
2102 restore_flags(flags);
2103
2104 /*
2105 * FIXME:
2106 * At this point we should send an ack if the difference
2107 * in the window, and the amount of space is bigger than
2108 * TCP_WINDOW_DIFF.
2109 */
2110
2111 if(sk->debug)
2112 printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
2113 left);
2114 if ((rspace=sk->prot->rspace(sk)) != left)
2115 {
2116 /*
2117 * This area has caused the most trouble. The current strategy
2118 * is to simply do nothing if the other end has room to send at
2119 * least 3 full packets, because the ack from those will auto-
2120 * matically update the window. If the other end doesn't think
2121 * we have much space left, but we have room for at least 1 more
2122 * complete packet than it thinks we do, we will send an ack
2123 * immediately. Otherwise we will wait up to .5 seconds in case
2124 * the user reads some more.
2125 */
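	/*
	 * Worked example (editorial, figures assumed): say we last
	 * advertised window = 4096, have bytes_rcv = 2048 unread and
	 * mtu = 1460. The peer believes it can still send
	 * 4096 - 2048 = 2048 bytes. If our real rspace now exceeds
	 * 2048 + 1460 = 3508 bytes - room for one more full segment
	 * than the peer thinks - the test below acks at once;
	 * otherwise we arm a short timer and hope the reader drains
	 * more first.
	 */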
2126 sk->ack_backlog++;
2127 /*
2128 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2129 * if the other end is offering a window smaller than the agreed on MSS
2130 * (called sk->mtu here). In theory there's no connection between send
2131 * and receive, and so no reason to think that they're going to send
2132 * small packets. For the moment I'm using the hack of reducing the mss
2133 * only on the send side, so I'm putting mtu here.
2134 */
2135
2136 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2137 {
2138 /* Send an ack right now. */
2139 tcp_read_wakeup(sk);
2140 }
2141 else
2142 {
2143 /* Force it to send an ack soon. */
2144 int was_active = del_timer(&sk->retransmit_timer);
2145 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2146 {
2147 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2148 }
2149 else
2150 add_timer(&sk->retransmit_timer);
2151 }
2152 }
2153 }
2154
2155
2156 /*
2157 * Handle reading urgent data. BSD has very simple semantics for
2158 * this, no blocking and very strange errors 8)
2159 */
2160
2161 static int tcp_read_urg(struct sock * sk, int nonblock,
2162 unsigned char *to, int len, unsigned flags)
2163 {
2164 /*
2165 * No URG data to read
2166 */
2167 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2168 return -EINVAL; /* Yes this is right ! */
2169
2170 if (sk->err)
2171 {
2172 int tmp = -sk->err;
2173 sk->err = 0;
2174 return tmp;
2175 }
2176
2177 if (sk->state == TCP_CLOSE || sk->done)
2178 {
2179 if (!sk->done) {
2180 sk->done = 1;
2181 return 0;
2182 }
2183 return -ENOTCONN;
2184 }
2185
2186 if (sk->shutdown & RCV_SHUTDOWN)
2187 {
2188 sk->done = 1;
2189 return 0;
2190 }
2191 sk->inuse = 1;
2192 if (sk->urg_data & URG_VALID)
2193 {
2194 char c = sk->urg_data;
2195 if (!(flags & MSG_PEEK))
2196 sk->urg_data = URG_READ;
2197 put_fs_byte(c, to);
2198 release_sock(sk);
2199 return 1;
2200 }
2201 release_sock(sk);
2202
2203 /*
2204 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2205 * the available implementations agree in this case:
2206 * this call should never block, independent of the
2207 * blocking state of the socket.
2208 * Mike <pall@rz.uni-karlsruhe.de>
2209 */
2210 return -EAGAIN;
2211 }
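/*
 * Editorial sketch of the user-visible semantics above (assumed
 * userspace code, not part of this file):
 *
 *	char c;
 *	int n = recv(fd, &c, 1, MSG_OOB);
 *
 * n is 1 if the urgent byte was pending; otherwise recv() fails with
 * EINVAL (SO_OOBINLINE set, no urgent data, or already read) or with
 * EAGAIN when urgent data is announced but not yet arrived - even on
 * a blocking socket.
 */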
2212
2213
2214 /*
2215 * This routine copies from a sock struct into the user buffer.
2216 */
2217
2218 static int tcp_read(struct sock *sk, unsigned char *to,
2219 int len, int nonblock, unsigned flags)
2220 {
2221 struct wait_queue wait = { current, NULL };
2222 int copied = 0;
2223 u32 peek_seq;
2224 volatile u32 *seq; /* So gcc doesn't overoptimise */
2225 unsigned long used;
2226
2227 /*
2228 * This error should be checked.
2229 */
2230
2231 if (sk->state == TCP_LISTEN)
2232 return -ENOTCONN;
2233
2234 /*
2235 * Urgent data needs to be handled specially.
2236 */
2237
2238 if (flags & MSG_OOB)
2239 return tcp_read_urg(sk, nonblock, to, len, flags);
2240
2241 /*
2242 * Copying sequence to update. This is volatile to handle
2243 * the multi-reader case neatly (memcpy_to/fromfs might be
2244 * inline and thus not flush cached variables otherwise).
2245 */
2246
2247 peek_seq = sk->copied_seq;
2248 seq = &sk->copied_seq;
2249 if (flags & MSG_PEEK)
2250 seq = &peek_seq;
2251
2252 add_wait_queue(sk->sleep, &wait);
2253 sk->inuse = 1;
2254 while (len > 0)
2255 {
2256 struct sk_buff * skb;
2257 u32 offset;
2258
2259 /*
2260 * Are we at urgent data? Stop if we have read anything.
2261 */
2262
2263 if (copied && sk->urg_data && sk->urg_seq == *seq)
2264 break;
2265
2266 /*
2267 * Next get a buffer.
2268 */
2269
2270 current->state = TASK_INTERRUPTIBLE;
2271
2272 skb = skb_peek(&sk->receive_queue);
2273 do
2274 {
2275 if (!skb)
2276 break;
2277 if (before(*seq, skb->h.th->seq))
2278 break;
2279 offset = *seq - skb->h.th->seq;
2280 if (skb->h.th->syn)
2281 offset--;
2282 if (offset < skb->len)
2283 goto found_ok_skb;
2284 if (skb->h.th->fin)
2285 goto found_fin_ok;
2286 if (!(flags & MSG_PEEK))
2287 skb->used = 1;
2288 skb = skb->next;
2289 }
2290 while (skb != (struct sk_buff *)&sk->receive_queue);
2291
2292 if (copied)
2293 break;
2294
2295 if (sk->err)
2296 {
2297 copied = -sk->err;
2298 sk->err = 0;
2299 break;
2300 }
2301
2302 if (sk->state == TCP_CLOSE)
2303 {
2304 if (!sk->done)
2305 {
2306 sk->done = 1;
2307 break;
2308 }
2309 copied = -ENOTCONN;
2310 break;
2311 }
2312
2313 if (sk->shutdown & RCV_SHUTDOWN)
2314 {
2315 sk->done = 1;
2316 break;
2317 }
2318
2319 if (nonblock)
2320 {
2321 copied = -EAGAIN;
2322 break;
2323 }
2324
2325 cleanup_rbuf(sk);
2326 release_sock(sk);
2327 sk->socket->flags |= SO_WAITDATA;
2328 schedule();
2329 sk->socket->flags &= ~SO_WAITDATA;
2330 sk->inuse = 1;
2331
2332 if (current->signal & ~current->blocked)
2333 {
2334 copied = -ERESTARTSYS;
2335 break;
2336 }
2337 continue;
2338
2339 found_ok_skb:
2340 /*
2341 * Lock the buffer. We can be fairly relaxed as
2342 * an interrupt will never steal a buffer we are
2343 * using unless I've missed something serious in
2344 * tcp_data.
2345 */
2346
2347 skb->users++;
2348
2349 /*
2350 * Ok so how much can we use ?
2351 */
2352
2353 used = skb->len - offset;
2354 if (len < used)
2355 used = len;
2356 /*
2357 * Do we have urgent data here?
2358 */
2359
2360 if (sk->urg_data)
2361 {
2362 u32 urg_offset = sk->urg_seq - *seq;
2363 if (urg_offset < used)
2364 {
2365 if (!urg_offset)
2366 {
2367 if (!sk->urginline)
2368 {
2369 ++*seq;
2370 offset++;
2371 used--;
2372 }
2373 }
2374 else
2375 used = urg_offset;
2376 }
2377 }
2378
2379 /*
2380 * Copy it - We _MUST_ update *seq first so that we
2381 * don't ever double read when we have dual readers
2382 */
2383
2384 *seq += used;
2385
2386 /*
2387 * This memcpy_tofs can sleep. If it sleeps and we
2388 * do a second read it relies on the skb->users to avoid
2389 * a crash when cleanup_rbuf() gets called.
2390 */
2391
2392 memcpy_tofs(to,((unsigned char *)skb->h.th) +
2393 skb->h.th->doff*4 + offset, used);
2394 copied += used;
2395 len -= used;
2396 to += used;
2397
2398 /*
2399 * We now will not sleep again until we are finished
2400 * with skb. Sorry if you are doing the SMP port
2401 * but you'll just have to fix it neatly ;)
2402 */
2403
2404 skb->users --;
2405
2406 if (after(sk->copied_seq,sk->urg_seq))
2407 sk->urg_data = 0;
2408 if (used + offset < skb->len)
2409 continue;
2410
2411 /*
2412 * Process the FIN.
2413 */
2414
2415 if (skb->h.th->fin)
2416 goto found_fin_ok;
2417 if (flags & MSG_PEEK)
2418 continue;
2419 skb->used = 1;
2420 continue;
2421
2422 found_fin_ok:
2423 ++*seq;
2424 if (flags & MSG_PEEK)
2425 break;
2426
2427 /*
2428 * All is done
2429 */
2430
2431 skb->used = 1;
2432 sk->shutdown |= RCV_SHUTDOWN;
2433 break;
2434
2435 }
2436 remove_wait_queue(sk->sleep, &wait);
2437 current->state = TASK_RUNNING;
2438
2439 /* Clean up data we have read: This will do ACK frames */
2440 cleanup_rbuf(sk);
2441 release_sock(sk);
2442 return copied;
2443 }
2444
2445 /*
2446 * State processing on a close. This implements the state shift for
2447 * sending our FIN frame. Note that we only send a FIN for some
2448 * states. A shutdown() may have already sent the FIN, or we may be
2449 * closed.
2450 */
2451
2452 static int tcp_close_state(struct sock *sk, int dead)
2453 {
2454 int ns=TCP_CLOSE;
2455 int send_fin=0;
2456 switch(sk->state)
2457 {
2458 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2459 break;
2460 case TCP_SYN_RECV:
2461 case TCP_ESTABLISHED: /* Closedown begin */
2462 ns=TCP_FIN_WAIT1;
2463 send_fin=1;
2464 break;
2465 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2466 case TCP_FIN_WAIT2:
2467 case TCP_CLOSING:
2468 ns=sk->state;
2469 break;
2470 case TCP_CLOSE:
2471 case TCP_LISTEN:
2472 break;
2473 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2474 wait only for the ACK */
2475 ns=TCP_LAST_ACK;
2476 send_fin=1;
2477 }
2478
2479 tcp_set_state(sk,ns);
2480
2481 /*
2482 * This is a (useful) BSD violation of the RFC. There is a
2483 * problem with TCP as specified in that the other end could
2484 * keep a socket open forever with no application left at this end.
2485 * We use a 3 minute timeout (about the same as BSD) then kill
2486 * our end. If they send after that then tough - BUT: long enough
2487 * that we won't make the old 4*rto = almost no time - whoops
2488 * reset mistake.
2489 */
2490 if(dead && ns==TCP_FIN_WAIT2)
2491 {
2492 int timer_active=del_timer(&sk->timer);
2493 if(timer_active)
2494 add_timer(&sk->timer);
2495 else
2496 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2497 }
2498
2499 return send_fin;
2500 }
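/*
 * Summary of the shifts above (editorial):
 *
 *	old state		new state	FIN sent?
 *	SYN_SENT		CLOSE		no
 *	SYN_RECV/ESTABLISHED	FIN_WAIT1	yes
 *	CLOSE_WAIT		LAST_ACK	yes
 *	FIN_WAIT1/2, CLOSING	unchanged	no
 *	anything else		CLOSE		no
 */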
2501
2502 /*
2503 * Send a fin.
2504 */
2505
2506 static void tcp_send_fin(struct sock *sk)
2507 {
2508 struct proto *prot =(struct proto *)sk->prot;
2509 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2510 struct tcphdr *t1;
2511 struct sk_buff *buff;
2512 struct device *dev=NULL;
2513 int tmp;
2514
2515 release_sock(sk); /* in case the malloc sleeps. */
2516
2517 buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2518 sk->inuse = 1;
2519
2520 if (buff == NULL)
2521 {
2522 /* This is a disaster if it occurs */
2523 printk("tcp_send_fin: Impossible malloc failure\n");
2524 return;
2525 }
2526
2527 /*
2528 * Administrivia
2529 */
2530
2531 buff->sk = sk;
2532 buff->localroute = sk->localroute;
2533
2534 /*
2535 * Put in the IP header and routing stuff.
2536 */
2537
2538 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2539 IPPROTO_TCP, sk->opt,
2540 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2541 if (tmp < 0)
2542 {
2543 int t;
2544 /*
2545 * Finish anyway, treat this as a send that got lost.
2546 * (Not good).
2547 */
2548
2549 buff->free = 1;
2550 prot->wfree(sk,buff);
2551 sk->write_seq++;
2552 t=del_timer(&sk->timer);
2553 if(t)
2554 add_timer(&sk->timer);
2555 else
2556 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2557 return;
2558 }
2559
2560 /*
2561 * We ought to check if the end of the queue is a buffer and
2562 * if so simply add the fin to that buffer, not send it ahead.
2563 */
2564
2565 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2566 buff->dev = dev;
2567 memcpy(t1, th, sizeof(*t1));
2568 t1->seq = ntohl(sk->write_seq);
2569 sk->write_seq++;
2570 buff->h.seq = sk->write_seq;
2571 t1->ack = 1;
2572 t1->ack_seq = ntohl(sk->acked_seq);
2573 t1->window = ntohs(sk->window=tcp_select_window(sk));
2574 t1->fin = 1;
2575 t1->rst = 0;
2576 t1->doff = sizeof(*t1)/4;
2577 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2578
2579 /*
2580 * If there is data in the write queue, the fin must be appended to
2581 * the write queue.
2582 */
2583
2584 if (skb_peek(&sk->write_queue) != NULL)
2585 {
2586 buff->free = 0;
2587 if (buff->next != NULL)
2588 {
2589 printk("tcp_send_fin: next != NULL\n");
2590 skb_unlink(buff);
2591 }
2592 skb_queue_tail(&sk->write_queue, buff);
2593 }
2594 else
2595 {
2596 sk->sent_seq = sk->write_seq;
2597 sk->prot->queue_xmit(sk, dev, buff, 0);
2598 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2599 }
2600 }
2601
2602 /*
2603 * Shutdown the sending side of a connection. Much like close except
2604 * that we don't shut down the receive side or set sk->dead=1.
2605 */
2606
2607 void tcp_shutdown(struct sock *sk, int how)
2608 {
2609 /*
2610 * We need to grab some memory, and put together a FIN,
2611 * and then put it into the queue to be sent.
2612 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2613 */
2614
2615 if (!(how & SEND_SHUTDOWN))
2616 return;
2617
2618 /*
2619 * If we've already sent a FIN, or it's a closed state
2620 */
2621
2622 if (sk->state == TCP_FIN_WAIT1 ||
2623 sk->state == TCP_FIN_WAIT2 ||
2624 sk->state == TCP_CLOSING ||
2625 sk->state == TCP_LAST_ACK ||
2626 sk->state == TCP_TIME_WAIT ||
2627 sk->state == TCP_CLOSE ||
2628 sk->state == TCP_LISTEN
2629 )
2630 {
2631 return;
2632 }
2633 sk->inuse = 1;
2634
2635 /*
2636 * flag that the sender has shutdown
2637 */
2638
2639 sk->shutdown |= SEND_SHUTDOWN;
2640
2641 /*
2642 * Clear out any half completed packets.
2643 */
2644
2645 if (sk->partial)
2646 tcp_send_partial(sk);
2647
2648 /*
2649 * FIN if needed
2650 */
2651
2652 if(tcp_close_state(sk,0))
2653 tcp_send_fin(sk);
2654
2655 release_sock(sk);
2656 }
2657
2658
2659 static int
2660 tcp_recvfrom(struct sock *sk, unsigned char *to,
2661 int to_len, int nonblock, unsigned flags,
2662 struct sockaddr_in *addr, int *addr_len)
2663 {
2664 int result;
2665
2666 /*
2667 * We have to check these first, unlike the old code. If
2668 * we checked them afterwards we could lose data on an error,
2669 * which would be wrong.
2670 */
2671
2672 if(addr_len)
2673 *addr_len = sizeof(*addr);
2674 result=tcp_read(sk, to, to_len, nonblock, flags);
2675
2676 if (result < 0)
2677 return(result);
2678
2679 if(addr)
2680 {
2681 addr->sin_family = AF_INET;
2682 addr->sin_port = sk->dummy_th.dest;
2683 addr->sin_addr.s_addr = sk->daddr;
2684 }
2685 return(result);
2686 }
2687
2688
2689 /*
2690 * This routine will send an RST to the other tcp.
2691 */
2692
2693 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2694 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2695 {
2696 struct sk_buff *buff;
2697 struct tcphdr *t1;
2698 int tmp;
2699 struct device *ndev=NULL;
2700
2701 /*
2702 * Cannot reset a reset (Think about it).
2703 */
2704
2705 if(th->rst)
2706 return;
2707
2708 /*
2709 * We need to grab some memory, and put together an RST,
2710 * and then put it into the queue to be sent.
2711 */
2712
2713 buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2714 if (buff == NULL)
2715 return;
2716
2717 buff->sk = NULL;
2718 buff->dev = dev;
2719 buff->localroute = 0;
2720
2721 /*
2722 * Put in the IP header and routing stuff.
2723 */
2724
2725 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2726 sizeof(struct tcphdr),tos,ttl);
2727 if (tmp < 0)
2728 {
2729 buff->free = 1;
2730 prot->wfree(NULL, buff);
2731 return;
2732 }
2733
2734 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2735 memcpy(t1, th, sizeof(*t1));
2736
2737 /*
2738 * Swap the send and the receive.
2739 */
2740
2741 t1->dest = th->source;
2742 t1->source = th->dest;
2743 t1->rst = 1;
2744 t1->window = 0;
2745
2746 if(th->ack)
2747 {
2748 t1->ack = 0;
2749 t1->seq = th->ack_seq;
2750 t1->ack_seq = 0;
2751 }
2752 else
2753 {
2754 t1->ack = 1;
2755 if(!th->syn)
2756 t1->ack_seq=htonl(th->seq);
2757 else
2758 t1->ack_seq=htonl(th->seq+1);
2759 t1->seq=0;
2760 }
2761
2762 t1->syn = 0;
2763 t1->urg = 0;
2764 t1->fin = 0;
2765 t1->psh = 0;
2766 t1->doff = sizeof(*t1)/4;
2767 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2768 prot->queue_xmit(NULL, ndev, buff, 1);
2769 tcp_statistics.TcpOutSegs++;
2770 }
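/*
 * Editorial note: the swap above follows the RFC 793 reset rules. If
 * the offending segment carried an ACK we answer
 * <SEQ=SEG.ACK><CTL=RST>; otherwise we answer
 * <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>, approximated here by
 * SEG.SEQ (+1 for a SYN) since the segment length is not known at
 * this point.
 */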
2771
2772
2773 /*
2774 * Look for tcp options. Parses everything but only knows about MSS.
2775 * This routine is always called with the packet containing the SYN.
2776 * However it may also be called with the ack to the SYN. So you
2777 * can't assume this is always the SYN. It's always called after
2778 * we have set up sk->mtu to our own MTU.
2779 *
2780 * We need at minimum to add PAWS support here. Possibly large windows
2781 * as Linux gets deployed on 100Mb/sec networks.
2782 */
2783
2784 static void tcp_options(struct sock *sk, struct tcphdr *th)
2785 {
2786 unsigned char *ptr;
2787 int length=(th->doff*4)-sizeof(struct tcphdr);
2788 int mss_seen = 0;
2789
2790 ptr = (unsigned char *)(th + 1);
2791
2792 while(length>0)
2793 {
2794 int opcode=*ptr++;
2795 int opsize=*ptr++;
2796 switch(opcode)
2797 {
2798 case TCPOPT_EOL:
2799 return;
2800 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2801 length--;
2802 ptr--; /* the opsize=*ptr++ above was a mistake */
2803 continue;
2804
2805 default:
2806 if(opsize<=2) /* Avoid silly options looping forever */
2807 return;
2808 switch(opcode)
2809 {
2810 case TCPOPT_MSS:
2811 if(opsize==4 && th->syn)
2812 {
2813 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2814 mss_seen = 1;
2815 }
2816 break;
2817 /* Add other options here as people feel the urge to implement stuff like large windows */
2818 }
2819 ptr+=opsize-2;
2820 length-=opsize;
2821 }
2822 }
2823 if (th->syn)
2824 {
2825 if (! mss_seen)
2826 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2827 }
2828 #ifdef CONFIG_INET_PCTCP
2829 sk->mss = min(sk->max_window >> 1, sk->mtu);
2830 #else
2831 sk->mss = min(sk->max_window, sk->mtu);
2832 #endif
2833 }
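/*
 * Editorial example of the wire format parsed above: the MSS option
 * in a SYN is kind=2, length=4, then the 16-bit MSS in network byte
 * order, so an MSS of 1460 appears on the wire as
 *
 *	02 04 05 b4
 *
 * A NOP (kind=1) is a single byte with no length octet, which is why
 * the parser backs ptr up again for TCPOPT_NOP.
 */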
2834
2835 static inline unsigned long default_mask(unsigned long dst)
2836 {
2837 dst = ntohl(dst);
2838 if (IN_CLASSA(dst))
2839 return htonl(IN_CLASSA_NET);
2840 if (IN_CLASSB(dst))
2841 return htonl(IN_CLASSB_NET);
2842 return htonl(IN_CLASSC_NET);
2843 }
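/*
 * Editorial example: default_mask() applied to 10.1.2.3 (class A)
 * yields 255.0.0.0, to 172.16.0.1 (class B) yields 255.255.0.0, and
 * to anything else yields the class C mask 255.255.255.0.
 */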
2844
2845 /*
2846 * Default sequence number picking algorithm.
2847 * As close as possible to RFC 793, which
2848 * suggests using a 250kHz clock.
2849 * Further reading shows this assumes 2MB/s networks.
2850 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2851 * That's funny, Linux has one built in! Use it!
2852 */
2853
2854 extern inline u32 tcp_init_seq(void)
2855 {
2856 struct timeval tv;
2857 do_gettimeofday(&tv);
2858 return tv.tv_usec+tv.tv_sec*1000000;
2859 }
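/*
 * Editorial note: the sequence space above advances one unit per
 * microsecond, so the 32-bit ISN wraps every 2^32 usec, roughly 71.6
 * minutes - comfortably longer than any plausible segment lifetime,
 * as RFC 793 intends.
 */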
2860
2861 /*
2862 * This routine handles a connection request.
2863 * It should make sure we haven't already responded.
2864 * Because of the way BSD works, we have to send a syn/ack now.
2865 * This also means it will be harder to close a socket which is
2866 * listening.
2867 */
2868
2869 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2870 unsigned long daddr, unsigned long saddr,
2871 struct options *opt, struct device *dev, u32 seq)
2872 {
2873 struct sk_buff *buff;
2874 struct tcphdr *t1;
2875 unsigned char *ptr;
2876 struct sock *newsk;
2877 struct tcphdr *th;
2878 struct device *ndev=NULL;
2879 int tmp;
2880 struct rtable *rt;
2881
2882 th = skb->h.th;
2883
2884 /* If the socket is dead, don't accept the connection. */
2885 if (!sk->dead)
2886 {
2887 sk->data_ready(sk,0);
2888 }
2889 else
2890 {
2891 if(sk->debug)
2892 printk("Reset on %p: Connect on dead socket.\n",sk);
2893 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2894 tcp_statistics.TcpAttemptFails++;
2895 kfree_skb(skb, FREE_READ);
2896 return;
2897 }
2898
2899 /*
2900 * Make sure we can accept more. This will prevent a
2901 * flurry of syns from eating up all our memory.
2902 */
2903
2904 if (sk->ack_backlog >= sk->max_ack_backlog)
2905 {
2906 tcp_statistics.TcpAttemptFails++;
2907 kfree_skb(skb, FREE_READ);
2908 return;
2909 }
2910
2911 /*
2912 * We need to build a new sock struct.
2913 * It is sort of bad to have a socket without an inode attached
2914 * to it, but the wake_up's will just wake up the listening socket,
2915 * and if the listening socket is destroyed before this is taken
2916 * off of the queue, this will take care of it.
2917 */
2918
2919 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2920 if (newsk == NULL)
2921 {
2922 /* just ignore the syn. It will get retransmitted. */
2923 tcp_statistics.TcpAttemptFails++;
2924 kfree_skb(skb, FREE_READ);
2925 return;
2926 }
2927
2928 memcpy(newsk, sk, sizeof(*newsk));
2929 skb_queue_head_init(&newsk->write_queue);
2930 skb_queue_head_init(&newsk->receive_queue);
2931 newsk->send_head = NULL;
2932 newsk->send_tail = NULL;
2933 skb_queue_head_init(&newsk->back_log);
2934 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2935 newsk->rto = TCP_TIMEOUT_INIT;
2936 newsk->mdev = 0;
2937 newsk->max_window = 0;
2938 newsk->cong_window = 1;
2939 newsk->cong_count = 0;
2940 newsk->ssthresh = 0;
2941 newsk->backoff = 0;
2942 newsk->blog = 0;
2943 newsk->intr = 0;
2944 newsk->proc = 0;
2945 newsk->done = 0;
2946 newsk->partial = NULL;
2947 newsk->pair = NULL;
2948 newsk->wmem_alloc = 0;
2949 newsk->rmem_alloc = 0;
2950 newsk->localroute = sk->localroute;
2951
2952 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2953
2954 newsk->err = 0;
2955 newsk->shutdown = 0;
2956 newsk->ack_backlog = 0;
2957 newsk->acked_seq = skb->h.th->seq+1;
2958 newsk->copied_seq = skb->h.th->seq+1;
2959 newsk->fin_seq = skb->h.th->seq;
2960 newsk->state = TCP_SYN_RECV;
2961 newsk->timeout = 0;
2962 newsk->ip_xmit_timeout = 0;
2963 newsk->write_seq = seq;
2964 newsk->window_seq = newsk->write_seq;
2965 newsk->rcv_ack_seq = newsk->write_seq;
2966 newsk->urg_data = 0;
2967 newsk->retransmits = 0;
2968 newsk->linger=0;
2969 newsk->destroy = 0;
2970 init_timer(&newsk->timer);
2971 newsk->timer.data = (unsigned long)newsk;
2972 newsk->timer.function = &net_timer;
2973 init_timer(&newsk->retransmit_timer);
2974 newsk->retransmit_timer.data = (unsigned long)newsk;
2975 newsk->retransmit_timer.function=&retransmit_timer;
2976 newsk->dummy_th.source = skb->h.th->dest;
2977 newsk->dummy_th.dest = skb->h.th->source;
2978
2979 /*
2980 * Swap these two, they are from our point of view.
2981 */
2982
2983 newsk->daddr = saddr;
2984 newsk->saddr = daddr;
2985
2986 put_sock(newsk->num,newsk);
2987 newsk->dummy_th.res1 = 0;
2988 newsk->dummy_th.doff = 6;
2989 newsk->dummy_th.fin = 0;
2990 newsk->dummy_th.syn = 0;
2991 newsk->dummy_th.rst = 0;
2992 newsk->dummy_th.psh = 0;
2993 newsk->dummy_th.ack = 0;
2994 newsk->dummy_th.urg = 0;
2995 newsk->dummy_th.res2 = 0;
2996 newsk->acked_seq = skb->h.th->seq + 1;
2997 newsk->copied_seq = skb->h.th->seq + 1;
2998 newsk->socket = NULL;
2999
3000 /*
3001 * Grab the ttl and tos values and use them
3002 */
3003
3004 newsk->ip_ttl=sk->ip_ttl;
3005 newsk->ip_tos=skb->ip_hdr->tos;
3006
3007 /*
3008 * Use 512 or whatever user asked for
3009 */
3010
3011 /*
3012 * Note use of sk->user_mss, since user has no direct access to newsk
3013 */
3014
3015 rt=ip_rt_route(saddr, NULL,NULL);
3016
3017 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3018 newsk->window_clamp = rt->rt_window;
3019 else
3020 newsk->window_clamp = 0;
3021
3022 if (sk->user_mss)
3023 newsk->mtu = sk->user_mss;
3024 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3025 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3026 else
3027 {
3028 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
3029 if ((saddr ^ daddr) & default_mask(saddr))
3030 #else
3031 if ((saddr ^ daddr) & dev->pa_mask)
3032 #endif
3033 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3034 else
3035 newsk->mtu = MAX_WINDOW;
3036 }
3037
3038 /*
3039 * But not bigger than device MTU
3040 */
3041
3042 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3043
3044 /*
3045 * This will min with what arrived in the packet
3046 */
3047
3048 tcp_options(newsk,skb->h.th);
3049
3050 tcp_cache_zap();
3051
3052 buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3053 if (buff == NULL)
3054 {
3055 sk->err = ENOMEM;
3056 newsk->dead = 1;
3057 newsk->state = TCP_CLOSE;
3058 /* And this will destroy it */
3059 release_sock(newsk);
3060 kfree_skb(skb, FREE_READ);
3061 tcp_statistics.TcpAttemptFails++;
3062 return;
3063 }
3064
3065 buff->sk = newsk;
3066 buff->localroute = newsk->localroute;
3067
3068 /*
3069 * Put in the IP header and routing stuff.
3070 */
3071
3072 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3073 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3074
3075 /*
3076 * Something went wrong.
3077 */
3078
3079 if (tmp < 0)
3080 {
3081 sk->err = tmp;
3082 buff->free = 1;
3083 kfree_skb(buff,FREE_WRITE);
3084 newsk->dead = 1;
3085 newsk->state = TCP_CLOSE;
3086 release_sock(newsk);
3087 skb->sk = sk;
3088 kfree_skb(skb, FREE_READ);
3089 tcp_statistics.TcpAttemptFails++;
3090 return;
3091 }
3092
3093 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3094
3095 memcpy(t1, skb->h.th, sizeof(*t1));
3096 buff->h.seq = newsk->write_seq;
3097 /*
3098 * Swap the send and the receive.
3099 */
3100 t1->dest = skb->h.th->source;
3101 t1->source = newsk->dummy_th.source;
3102 t1->seq = ntohl(newsk->write_seq++);
3103 t1->ack = 1;
3104 newsk->window = tcp_select_window(newsk);
3105 newsk->sent_seq = newsk->write_seq;
3106 t1->window = ntohs(newsk->window);
3107 t1->res1 = 0;
3108 t1->res2 = 0;
3109 t1->rst = 0;
3110 t1->urg = 0;
3111 t1->psh = 0;
3112 t1->syn = 1;
3113 t1->ack_seq = ntohl(skb->h.th->seq+1);
3114 t1->doff = sizeof(*t1)/4+1;
3115 ptr = skb_put(buff,4);
3116 ptr[0] = 2;
3117 ptr[1] = 4;
3118 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3119 ptr[3] =(newsk->mtu) & 0xff;
3120
3121 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3122 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3123 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3124 skb->sk = newsk;
3125
3126 /*
3127 * Charge the sock_buff to newsk.
3128 */
3129
3130 sk->rmem_alloc -= skb->truesize;
3131 newsk->rmem_alloc += skb->truesize;
3132
3133 skb_queue_tail(&sk->receive_queue,skb);
3134 sk->ack_backlog++;
3135 release_sock(newsk);
3136 tcp_statistics.TcpOutSegs++;
3137 }
3138
3139
3140 static void tcp_close(struct sock *sk, int timeout)
3141 {
3142 /*
3143 * We need to grab some memory, and put together a FIN,
3144 * and then put it into the queue to be sent.
3145 */
3146
3147 sk->inuse = 1;
3148
3149 if(th_cache_sk==sk)
3150 tcp_cache_zap();
3151 if(sk->state == TCP_LISTEN)
3152 {
3153 /* Special case */
3154 tcp_set_state(sk, TCP_CLOSE);
3155 tcp_close_pending(sk);
3156 release_sock(sk);
3157 return;
3158 }
3159
3160 sk->keepopen = 1;
3161 sk->shutdown = SHUTDOWN_MASK;
3162
3163 if (!sk->dead)
3164 sk->state_change(sk);
3165
3166 if (timeout == 0)
3167 {
3168 struct sk_buff *skb;
3169
3170 /*
3171 * We need to flush the recv. buffs. We do this only on the
3172 * descriptor close, not protocol-sourced closes, because the
3173 * reader process may not have drained the data yet!
3174 */
3175
3176 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3177 kfree_skb(skb, FREE_READ);
3178 /*
3179 * Get rid of any half-completed packets.
3180 */
3181
3182 if (sk->partial)
3183 tcp_send_partial(sk);
3184 }
3185
3186
3187 /*
3188 * Timeout is not the same thing - however the code likes
3189 * to send both the same way (sigh).
3190 */
3191
3192 if(timeout)
3193 {
3194 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3195 }
3196 else
3197 {
3198 if(tcp_close_state(sk,1)==1)
3199 {
3200 tcp_send_fin(sk);
3201 }
3202 }
3203 release_sock(sk);
3204 }
3205
3206
3207 /*
3208 * This routine takes stuff off of the write queue,
3209 * and puts it in the xmit queue. This happens as incoming acks
3210 * open up the remote window for us.
3211 */
3212
3213 static void tcp_write_xmit(struct sock *sk)
3214 {
3215 struct sk_buff *skb;
3216
3217 /*
3218 * The bytes will have to remain here. In time closedown will
3219 * empty the write queue and all will be happy.
3220 */
3221
3222 if(sk->zapped)
3223 return;
3224
3225 /*
3226 * Anything on the transmit queue that fits the window can
3227 * be added providing we are not
3228 *
3229 * a) retransmitting (Nagle's rule)
3230 * b) exceeding our congestion window.
3231 */
3232
3233 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3234 before(skb->h.seq, sk->window_seq + 1) &&
3235 (sk->retransmits == 0 ||
3236 sk->ip_xmit_timeout != TIME_WRITE ||
3237 before(skb->h.seq, sk->rcv_ack_seq + 1))
3238 && sk->packets_out < sk->cong_window)
3239 {
3240 IS_SKB(skb);
3241 skb_unlink(skb);
3242
3243 /*
3244 * See if we really need to send the packet.
3245 */
3246
3247 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3248 {
3249 /*
3250 * This is acked data. We can discard it. This
3251 * cannot currently occur.
3252 */
3253
3254 sk->retransmits = 0;
3255 kfree_skb(skb, FREE_WRITE);
3256 if (!sk->dead)
3257 sk->write_space(sk);
3258 }
3259 else
3260 {
3261 struct tcphdr *th;
3262 struct iphdr *iph;
3263 int size;
3264 /*
3265 * put in the ack seq and window at this point rather than earlier,
3266 * in order to keep them monotonic. We really want to avoid taking
3267 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3268 * Ack and window will in general have changed since this packet was put
3269 * on the write queue.
3270 */
3271 iph = skb->ip_hdr;
3272 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3273 size = skb->len - (((unsigned char *) th) - skb->data);
3274
3275 th->ack_seq = ntohl(sk->acked_seq);
3276 th->window = ntohs(tcp_select_window(sk));
3277
3278 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3279
3280 sk->sent_seq = skb->h.seq;
3281
3282 /*
3283 * IP manages our queue for some crazy reason
3284 */
3285
3286 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3287
3288 /*
3289 * Again we slide the timer wrongly
3290 */
3291
3292 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3293 }
3294 }
3295 }
3296
3297
3298 /*
3299 * This routine deals with incoming acks, but not outgoing ones.
3300 */
3301
3302 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3303 {
3304 u32 ack;
3305 int flag = 0;
3306
3307 /*
3308 * 1 - there was data in the packet as well as the ack, new data was
3309 * sent, or we are in a shutdown state
3310 * 2 - data from retransmit queue was acked and removed
3311 * 4 - window shrunk or data from retransmit queue was acked and removed
3312 */
3313
3314 if(sk->zapped)
3315 return(1); /* Dead, can't ack any more so why bother */
3316
3317 /*
3318 * Have we discovered a larger window
3319 */
3320
3321 ack = ntohl(th->ack_seq);
3322
3323 if (ntohs(th->window) > sk->max_window)
3324 {
3325 sk->max_window = ntohs(th->window);
3326 #ifdef CONFIG_INET_PCTCP
3327 /* Hack because we don't send partial packets to non SWS
3328 handling hosts */
3329 sk->mss = min(sk->max_window>>1, sk->mtu);
3330 #else
3331 sk->mss = min(sk->max_window, sk->mtu);
3332 #endif
3333 }
3334
3335 /*
3336 * We have dropped back to keepalive timeouts. Thus we have
3337 * no retransmits pending.
3338 */
3339
3340 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3341 sk->retransmits = 0;
3342
3343 /*
3344 * If the ack is newer than sent or older than previous acks
3345 * then we can probably ignore it.
3346 */
3347
3348 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3349 {
3350 if(sk->debug)
3351 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3352
3353 /*
3354 * Keepalive processing.
3355 */
3356
3357 if (after(ack, sk->sent_seq))
3358 {
3359 return(0);
3360 }
3361
3362 /*
3363 * Restart the keepalive timer.
3364 */
3365
3366 if (sk->keepopen)
3367 {
3368 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3369 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3370 }
3371 return(1);
3372 }
3373
3374 /*
3375 * If there is data, set flag bit 1
3376 */
3377
3378 if (len != th->doff*4)
3379 flag |= 1;
3380
3381 /*
3382 * See if our window has been shrunk.
3383 */
3384
3385 if (after(sk->window_seq, ack+ntohs(th->window)))
3386 {
3387 /*
3388 * We may need to move packets from the send queue
3389 * to the write queue, if the window has been shrunk on us.
3390 * The RFC says you are not allowed to shrink your window
3391 * like this, but if the other end does, you must be able
3392 * to deal with it.
3393 */
3394 struct sk_buff *skb;
3395 struct sk_buff *skb2;
3396 struct sk_buff *wskb = NULL;
3397
3398 skb2 = sk->send_head;
3399 sk->send_head = NULL;
3400 sk->send_tail = NULL;
3401
3402 /*
3403 * This is an artifact of a flawed concept. We want one
3404 * queue and a smarter send routine when we send all.
3405 */
3406
3407 flag |= 4; /* Window changed */
3408
3409 sk->window_seq = ack + ntohs(th->window);
3410 cli();
3411 while (skb2 != NULL)
3412 {
3413 skb = skb2;
3414 skb2 = skb->link3;
3415 skb->link3 = NULL;
3416 if (after(skb->h.seq, sk->window_seq))
3417 {
3418 if (sk->packets_out > 0)
3419 sk->packets_out--;
3420 /* We may need to remove this from the dev send list. */
3421 if (skb->next != NULL)
3422 {
3423 skb_unlink(skb);
3424 }
3425 /* Now add it to the write_queue. */
3426 if (wskb == NULL)
3427 skb_queue_head(&sk->write_queue,skb);
3428 else
3429 skb_append(wskb,skb);
3430 wskb = skb;
3431 }
3432 else
3433 {
3434 if (sk->send_head == NULL)
3435 {
3436 sk->send_head = skb;
3437 sk->send_tail = skb;
3438 }
3439 else
3440 {
3441 sk->send_tail->link3 = skb;
3442 sk->send_tail = skb;
3443 }
3444 skb->link3 = NULL;
3445 }
3446 }
3447 sti();
3448 }
3449
3450 /*
3451 * Pipe has emptied
3452 */
3453
3454 if (sk->send_tail == NULL || sk->send_head == NULL)
3455 {
3456 sk->send_head = NULL;
3457 sk->send_tail = NULL;
3458 sk->packets_out= 0;
3459 }
3460
3461 /*
3462 * Update the right hand window edge of the host
3463 */
3464
3465 sk->window_seq = ack + ntohs(th->window);
3466
3467 /*
3468 * We don't want too many packets out there.
3469 */
3470
3471 if (sk->ip_xmit_timeout == TIME_WRITE &&
3472 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3473 {
3474 /*
3475 * This is Jacobson's slow start and congestion avoidance.
3476 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3477 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3478 * counter and increment it once every cwnd times. It's possible
3479 * that this should be done only if sk->retransmits == 0. I'm
3480 * interpreting "new data is acked" as including data that has
3481 * been retransmitted but is just now being acked.
3482 */
3483 if (sk->cong_window < sk->ssthresh)
3484 /*
3485 * In "safe" area, increase
3486 */
3487 sk->cong_window++;
3488 else
3489 {
3490 /*
3491 * In dangerous area, increase slowly. In theory this is
3492 * sk->cong_window += 1 / sk->cong_window
3493 */
3494 if (sk->cong_count >= sk->cong_window)
3495 {
3496 sk->cong_window++;
3497 sk->cong_count = 0;
3498 }
3499 else
3500 sk->cong_count++;
3501 }
3502 }
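	/*
	 * Growth pattern implied above (editorial, assuming
	 * ssthresh = 8 and one ack per segment): cong_window rises by
	 * one per ack while below ssthresh, roughly doubling each
	 * round trip; above ssthresh it takes cong_window acks per
	 * increment, i.e. about one extra segment per round trip -
	 * the linear congestion avoidance regime.
	 */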
3503
3504 /*
3505 * Remember the highest ack received.
3506 */
3507
3508 sk->rcv_ack_seq = ack;
3509
3510 /*
3511 * If this ack opens up a zero window, clear backoff. It was
3512 * being used to time the probes, and is probably far higher than
3513 * it needs to be for normal retransmission.
3514 */
3515
3516 if (sk->ip_xmit_timeout == TIME_PROBE0)
3517 {
3518 sk->retransmits = 0; /* Our probe was answered */
3519
3520 /*
3521 * Was it a usable window open ?
3522 */
3523
3524 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3525 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3526 {
3527 sk->backoff = 0;
3528
3529 /*
3530 * Recompute rto from rtt. this eliminates any backoff.
3531 */
3532
3533 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3534 if (sk->rto > 120*HZ)
3535 sk->rto = 120*HZ;
3536 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3537 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3538 .2 of a second is going to need huge windows (SIGH) */
3539 sk->rto = 20;
3540 }
3541 }
3542
3543 /*
3544 * See if we can take anything off of the retransmit queue.
3545 */
3546
3547 while(sk->send_head != NULL)
3548 {
3549 /* Check for a bug. */
3550 if (sk->send_head->link3 &&
3551 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3552 printk("INET: tcp.c: *** bug send_list out of order.\n");
3553
3554 /*
3555 * If our packet is before the ack sequence we can
3556 * discard it as it's confirmed to have arrived at the other end.
3557 */
3558
3559 if (before(sk->send_head->h.seq, ack+1))
3560 {
3561 struct sk_buff *oskb;
3562 if (sk->retransmits)
3563 {
3564 /*
3565 * We were retransmitting. don't count this in RTT est
3566 */
3567 flag |= 2;
3568
3569 /*
3570 * even though we've gotten an ack, we're still
3571 * retransmitting as long as we're sending from
3572 * the retransmit queue. Keeping retransmits non-zero
3573 * prevents us from getting new data interspersed with
3574 * retransmissions.
3575 */
3576
3577 if (sk->send_head->link3) /* Any more queued retransmits? */
3578 sk->retransmits = 1;
3579 else
3580 sk->retransmits = 0;
3581 }
3582 /*
3583 * Note that we only reset backoff and rto in the
3584 * rtt recomputation code. And that doesn't happen
3585 * if there were retransmissions in effect. So the
3586 * first new packet after the retransmissions is
3587 * sent with the backoff still in effect. Not until
3588 * we get an ack from a non-retransmitted packet do
3589 * we reset the backoff and rto. This allows us to deal
3590 * with a situation where the network delay has increased
3591 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3592 */
3593
3594 /*
3595 * We have one less packet out there.
3596 */
3597
3598 if (sk->packets_out > 0)
3599 sk->packets_out --;
3600 /*
3601 * Wake up the process, it can probably write more.
3602 */
3603 if (!sk->dead)
3604 sk->write_space(sk);
3605 oskb = sk->send_head;
3606
3607 if (!(flag&2)) /* Not retransmitting */
3608 {
3609 long m;
3610
3611 /*
3612 * The following amusing code comes from Jacobson's
3613 * article in SIGCOMM '88. Note that rtt and mdev
3614 * are scaled versions of rtt and mean deviation.
3615 * This is designed to be as fast as possible
3616 * m stands for "measurement".
3617 */
3618
3619 m = jiffies - oskb->when; /* RTT */
3620 if(m<=0)
3621 m=1; /* IS THIS RIGHT FOR <0 ??? */
3622 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3623 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3624 if (m < 0)
3625 m = -m; /* m is now abs(error) */
3626 m -= (sk->mdev >> 2); /* similar update on mdev */
3627 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3628
3629 /*
3630 * Now update timeout. Note that this removes any backoff.
3631 */
3632
3633 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3634 if (sk->rto > 120*HZ)
3635 sk->rto = 120*HZ;
3636 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3637 sk->rto = 20;
3638 sk->backoff = 0;
3639 }
3640 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
3641 in this case' as we just set it up */
3642 cli();
3643 oskb = sk->send_head;
3644 IS_SKB(oskb);
3645 sk->send_head = oskb->link3;
3646 if (sk->send_head == NULL)
3647 {
3648 sk->send_tail = NULL;
3649 }
3650
3651 /*
3652 * We may need to remove this from the dev send list.
3653 */
3654
3655 if (oskb->next)
3656 skb_unlink(oskb);
3657 sti();
3658 kfree_skb(oskb, FREE_WRITE); /* write. */
3659 if (!sk->dead)
3660 sk->write_space(sk);
3661 }
3662 else
3663 {
3664 break;
3665 }
3666 }
3667
3668 /*
3669 * XXX someone ought to look at this too... at the moment, if skb_peek()
3670 * returns non-NULL, we completely ignore the timer stuff in the else
3671 * clause. We ought to organize the code so that the else clause can
3672 * (should) be executed regardless, possibly moving the PROBE timer
3673 * reset over. The skb_peek() thing should only move stuff to the
3674 * write queue, NOT also manage the timer functions.
3675 */
3676
3677 /*
3678 * Maybe we can take some stuff off of the write queue,
3679 * and put it onto the xmit queue.
3680 */
3681 if (skb_peek(&sk->write_queue) != NULL)
3682 {
3683 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3684 (sk->retransmits == 0 ||
3685 sk->ip_xmit_timeout != TIME_WRITE ||
3686 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3687 && sk->packets_out < sk->cong_window)
3688 {
3689 /*
3690 * Add more data to the send queue.
3691 */
3692 flag |= 1;
3693 tcp_write_xmit(sk);
3694 }
3695 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3696 sk->send_head == NULL &&
3697 sk->ack_backlog == 0 &&
3698 sk->state != TCP_TIME_WAIT)
3699 {
3700 /*
3701 * Data to queue but no room.
3702 */
3703 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3704 }
3705 }
3706 else
3707 {
3708 /*
3709 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3710 * from TCP_CLOSE we don't do anything
3711 *
3712 * from anything else, if there is write data (or fin) pending,
3713 * we use a TIME_WRITE timeout, else if keepalive we reset to
3714 * a KEEPALIVE timeout, else we delete the timer.
3715 *
3716 * We do not set flag for nominal write data, otherwise we may
3717 * force a state where we start to write itsy bitsy tidbits
3718 * of data.
3719 */
3720
3721 switch(sk->state) {
3722 case TCP_TIME_WAIT:
3723 /*
3724 * keep us in TIME_WAIT until we stop getting packets,
3725 * reset the timeout.
3726 */
3727 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3728 break;
3729 case TCP_CLOSE:
3730 /*
3731 * don't touch the timer.
3732 */
3733 break;
3734 default:
3735 /*
3736 * Must check send_head, write_queue, and ack_backlog
3737 * to determine which timeout to use.
3738 */
3739 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3740 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3741 } else if (sk->keepopen) {
3742 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3743 } else {
3744 del_timer(&sk->retransmit_timer);
3745 sk->ip_xmit_timeout = 0;
3746 }
3747 break;
3748 }
3749 }
3750
3751 /*
3752 * We have nothing queued but space to send. Send any partial
3753 * packets immediately (end of Nagle rule application).
3754 */
3755
3756 if (sk->packets_out == 0 && sk->partial != NULL &&
3757 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3758 {
3759 flag |= 1;
3760 tcp_send_partial(sk);
3761 }
3762
3763 /*
3764 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3765 * we are now waiting for an acknowledge to our FIN. The other end is
3766 * already in TIME_WAIT.
3767 *
3768 * Move to TCP_CLOSE on success.
3769 */
3770
3771 if (sk->state == TCP_LAST_ACK)
3772 {
3773 if (!sk->dead)
3774 sk->state_change(sk);
3775 if(sk->debug)
3776 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3777 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3778 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3779 {
3780 flag |= 1;
3781 tcp_set_state(sk,TCP_CLOSE);
3782 sk->shutdown = SHUTDOWN_MASK;
3783 }
3784 }
3785
3786 /*
3787 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3788 *
3789 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3790 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3791 */
3792
3793 if (sk->state == TCP_FIN_WAIT1)
3794 {
3795
3796 if (!sk->dead)
3797 sk->state_change(sk);
3798 if (sk->rcv_ack_seq == sk->write_seq)
3799 {
3800 flag |= 1;
3801 sk->shutdown |= SEND_SHUTDOWN;
3802 tcp_set_state(sk, TCP_FIN_WAIT2);
3803 }
3804 }
3805
3806 /*
3807 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3808 *
3809 * Move to TIME_WAIT
3810 */
3811
3812 if (sk->state == TCP_CLOSING)
3813 {
3814
3815 if (!sk->dead)
3816 sk->state_change(sk);
3817 if (sk->rcv_ack_seq == sk->write_seq)
3818 {
3819 flag |= 1;
3820 tcp_time_wait(sk);
3821 }
3822 }
3823
3824 /*
3825 * Final ack of a three way shake
3826 */
3827
3828 if(sk->state==TCP_SYN_RECV)
3829 {
3830 tcp_set_state(sk, TCP_ESTABLISHED);
3831 tcp_options(sk,th);
3832 sk->dummy_th.dest=th->source;
3833 sk->copied_seq = sk->acked_seq;
3834 if(!sk->dead)
3835 sk->state_change(sk);
3836 if(sk->max_window==0)
3837 {
3838 sk->max_window=32; /* Sanity check */
3839 sk->mss=min(sk->max_window,sk->mtu);
3840 }
3841 }
3842
3843 /*
3844 * I make no guarantees about the first clause in the following
3845 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3846 * what conditions "!flag" would be true. However I think the rest
3847 * of the conditions would prevent that from causing any
3848 * unnecessary retransmission.
3849 * Clearly if the first packet has expired it should be
3850 * retransmitted. The other alternative, "flag&2 && retransmits", is
3851 * harder to explain: You have to look carefully at how and when the
3852 * timer is set and with what timeout. The most recent transmission always
3853 * sets the timer. So in general if the most recent thing has timed
3854 * out, everything before it has as well. So we want to go ahead and
3855 * retransmit some more. If we didn't explicitly test for this
3856 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3857 * would not be true. If you look at the pattern of timing, you can
3858 * show that rto is increased fast enough that the next packet would
3859 * almost never be retransmitted immediately. Then you'd end up
3860 * waiting for a timeout to send each packet on the retransmission
3861 * queue. With my implementation of the Karn sampling algorithm,
3862 * the timeout would double each time. The net result is that it would
3863 * take a hideous amount of time to recover from a single dropped packet.
3864 * It's possible that there should also be a test for TIME_WRITE, but
3865 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3866 * got to be in real retransmission mode.
3867 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3868 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3869 * As long as no further losses occur, this seems reasonable.
3870 */
3871
3872 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3873 (((flag&2) && sk->retransmits) ||
3874 (sk->send_head->when + sk->rto < jiffies)))
3875 {
3876 if(sk->send_head->when + sk->rto < jiffies)
3877 tcp_retransmit(sk,0);
3878 else
3879 {
3880 tcp_do_retransmit(sk, 1);
3881 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3882 }
3883 }
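	/*
	 *	Illustrative sketch (not part of the original file): the Karn
	 *	backoff the comment above leans on. Each expiry doubles rto,
	 *	with the same 120 second cap tcp_send_probe0() applies below;
	 *	the variables here are hypothetical.
	 */
#if 0
	rto = min(rto << 1, 120*HZ);	/* successive timeouts back off 1x, 2x, 4x, ... */
#endif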
3884
3885 return(1);
3886 }
3887
3888
3889 /*
3890 * Process the FIN bit. This now behaves as it is supposed to work
3891 * and the FIN takes effect when it is validly part of sequence
3892	 * space, not before, i.e. not while there are still holes ahead of it.
3893 *
3894 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3895 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3896 * TIME-WAIT)
3897 *
3898 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3899 * close and we go into CLOSING (and later onto TIME-WAIT)
3900 *
3901 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3902 *
3903 */
3904
3905 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3906 {
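	/* A SYN or FIN occupies one unit of sequence space, so both are
	 * counted into the last sequence number this FIN accounts for. */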
3907 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3908
3909 if (!sk->dead)
3910 {
3911 sk->state_change(sk);
3912 sock_wake_async(sk->socket, 1);
3913 }
3914
3915 switch(sk->state)
3916 {
3917 case TCP_SYN_RECV:
3918 case TCP_SYN_SENT:
3919 case TCP_ESTABLISHED:
3920 /*
3921 * move to CLOSE_WAIT, tcp_data() already handled
3922 * sending the ack.
3923 */
3924 tcp_set_state(sk,TCP_CLOSE_WAIT);
3925 if (th->rst)
3926 sk->shutdown = SHUTDOWN_MASK;
3927 break;
3928
3929 case TCP_CLOSE_WAIT:
3930 case TCP_CLOSING:
3931 /*
3932 * received a retransmission of the FIN, do
3933 * nothing.
3934 */
3935 break;
3936 case TCP_TIME_WAIT:
3937 /*
3938 * received a retransmission of the FIN,
3939 * restart the TIME_WAIT timer.
3940 */
3941 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3942 return(0);
3943 case TCP_FIN_WAIT1:
3944 /*
3945 * This case occurs when a simultaneous close
3946 * happens, we must ack the received FIN and
3947 * enter the CLOSING state.
3948 *
3949 * This causes a WRITE timeout, which will either
3950 * move on to TIME_WAIT when we timeout, or resend
3951 * the FIN properly (maybe we get rid of that annoying
3952 * FIN lost hang). The TIME_WRITE code is already correct
3953 * for handling this timeout.
3954 */
3955
3956 if(sk->ip_xmit_timeout != TIME_WRITE)
3957 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3958 tcp_set_state(sk,TCP_CLOSING);
3959 break;
3960 case TCP_FIN_WAIT2:
3961 /*
3962 * received a FIN -- send ACK and enter TIME_WAIT
3963 */
3964 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3965 sk->shutdown|=SHUTDOWN_MASK;
3966 tcp_set_state(sk,TCP_TIME_WAIT);
3967 break;
3968 case TCP_CLOSE:
3969 /*
3970 * already in CLOSE
3971 */
3972 break;
3973 default:
3974 tcp_set_state(sk,TCP_LAST_ACK);
3975
3976 /* Start the timers. */
3977 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3978 return(0);
3979 }
3980
3981 return(0);
3982 }
3983
3984
3985
3986 /*
3987 * This routine handles the data. If there is room in the buffer,
3988	 *	it will already have been moved into it. If there is no
3989 * room, then we will just have to discard the packet.
3990 */
3991
3992 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
3993 unsigned long saddr, unsigned short len)
3994 {
3995 struct sk_buff *skb1, *skb2;
3996 struct tcphdr *th;
3997 int dup_dumped=0;
3998 u32 new_seq, shut_seq;
3999
4000 th = skb->h.th;
4001 skb_pull(skb,th->doff*4);
4002 skb_trim(skb,len-(th->doff*4));
4003
4004 /*
4005	 *	The bytes in the receive read/assembly queue have increased. Needed for the
4006 * low memory discard algorithm
4007 */
4008
4009 sk->bytes_rcv += skb->len;
4010
4011 if (skb->len == 0 && !th->fin)
4012 {
4013 /*
4014 * Don't want to keep passing ack's back and forth.
4015 * (someone sent us dataless, boring frame)
4016 */
4017 if (!th->ack)
4018 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4019 kfree_skb(skb, FREE_READ);
4020 return(0);
4021 }
4022
4023 /*
4024 * We no longer have anyone receiving data on this connection.
4025 */
4026
4027 #ifndef TCP_DONT_RST_SHUTDOWN
4028
4029 if(sk->shutdown & RCV_SHUTDOWN)
4030 {
4031 /*
4032 * FIXME: BSD has some magic to avoid sending resets to
4033 * broken 4.2 BSD keepalives. Much to my surprise a few non
4034 * BSD stacks still have broken keepalives so we want to
4035 * cope with it.
4036 */
4037
4038 if(skb->len) /* We don't care if it's just an ack or
4039 a keepalive/window probe */
4040 {
4041 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4042
4043 /* Do this the way 4.4BSD treats it. Not what I'd
4044 regard as the meaning of the spec but it's what BSD
4045 does and clearly they know everything 8) */
4046
4047 /*
4048 * This is valid because of two things
4049 *
4050 * a) The way tcp_data behaves at the bottom.
4051 * b) A fin takes effect when read not when received.
4052 */
4053
4054 shut_seq=sk->acked_seq+1; /* Last byte */
4055
4056 if(after(new_seq,shut_seq))
4057 {
4058 if(sk->debug)
4059 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4060 sk, new_seq, shut_seq, sk->blog);
4061 if(sk->dead)
4062 {
4063 sk->acked_seq = new_seq + th->fin;
4064 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4065 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4066 tcp_statistics.TcpEstabResets++;
4067 tcp_set_state(sk,TCP_CLOSE);
4068 sk->err = EPIPE;
4069 sk->shutdown = SHUTDOWN_MASK;
4070 kfree_skb(skb, FREE_READ);
4071 return 0;
4072 }
4073 }
4074 }
4075 }
4076
4077 #endif
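	/*
	 *	Illustrative sketch (not part of the original file): before()
	 *	and after(), used throughout this function, compare sequence
	 *	numbers modulo 2^32 via the sign of the signed difference and
	 *	so stay correct across wrap. Hypothetical equivalents, assuming
	 *	a signed 32-bit s32 type:
	 */
#if 0
#define seq_before(a,b)	((s32)((a)-(b)) < 0)
#define seq_after(a,b)	((s32)((b)-(a)) < 0)
#endif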
4078
4079 /*
4080 * Now we have to walk the chain, and figure out where this one
4081 * goes into it. This is set up so that the last packet we received
4082 * will be the first one we look at, that way if everything comes
4083 * in order, there will be no performance loss, and if they come
4084 * out of order we will be able to fit things in nicely.
4085 *
4086 * [AC: This is wrong. We should assume in order first and then walk
4087 * forwards from the first hole based upon real traffic patterns.]
4088 *
4089 */
4090
4091 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4092 {
4093 skb_queue_head(&sk->receive_queue,skb);
4094 skb1= NULL;
4095 }
4096 else
4097 {
4098 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4099 {
4100 if(sk->debug)
4101 {
4102 printk("skb1=%p :", skb1);
4103 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4104 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4105 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4106 sk->acked_seq);
4107 }
4108
4109 /*
4110 * Optimisation: Duplicate frame or extension of previous frame from
4111 * same sequence point (lost ack case).
4112 * The frame contains duplicate data or replaces a previous frame
4113 * discard the previous frame (safe as sk->inuse is set) and put
4114 * the new one in its place.
4115 */
4116
4117 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4118 {
4119 skb_append(skb1,skb);
4120 skb_unlink(skb1);
4121 kfree_skb(skb1,FREE_READ);
4122 dup_dumped=1;
4123 skb1=NULL;
4124 break;
4125 }
4126
4127 /*
4128 * Found where it fits
4129 */
4130
4131 if (after(th->seq+1, skb1->h.th->seq))
4132 {
4133 skb_append(skb1,skb);
4134 break;
4135 }
4136
4137 /*
4138 * See if we've hit the start. If so insert.
4139 */
4140 if (skb1 == skb_peek(&sk->receive_queue))
4141 {
4142 skb_queue_head(&sk->receive_queue, skb);
4143 break;
4144 }
4145 }
4146 }
4147
4148 /*
4149 * Figure out what the ack value for this frame is
4150 */
4151
4152 th->ack_seq = th->seq + skb->len;
4153 if (th->syn)
4154 th->ack_seq++;
4155 if (th->fin)
4156 th->ack_seq++;
4157
4158 if (before(sk->acked_seq, sk->copied_seq))
4159 {
4160 printk("*** tcp.c:tcp_data bug acked < copied\n");
4161 sk->acked_seq = sk->copied_seq;
4162 }
4163
4164 /*
4165 * Now figure out if we can ack anything. This is very messy because we really want two
4166 * receive queues, a completed and an assembly queue. We also want only one transmit
4167 * queue.
4168 */
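	/*
	 *	Design sketch only (not in the original file): the shape the
	 *	comment above wishes for. The names are hypothetical.
	 */
#if 0
	struct rcv_queues {
		struct sk_buff_head completed;	/* in order, ready to ack and read */
		struct sk_buff_head assembly;	/* out of order, awaiting holes to fill */
	};
#endif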
4169
4170 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4171 {
4172 if (before(th->seq, sk->acked_seq+1))
4173 {
4174 int newwindow;
4175
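			/*
			 *	Shrink our advertised window by the data newly
			 *	accepted; it re-opens as the reader consumes it.
			 */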
4176 if (after(th->ack_seq, sk->acked_seq))
4177 {
4178 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4179 if (newwindow < 0)
4180 newwindow = 0;
4181 sk->window = newwindow;
4182 sk->acked_seq = th->ack_seq;
4183 }
4184 skb->acked = 1;
4185
4186 /*
4187 * When we ack the fin, we do the FIN
4188 * processing.
4189 */
4190
4191 if (skb->h.th->fin)
4192 {
4193 tcp_fin(skb,sk,skb->h.th);
4194 }
4195
4196 for(skb2 = skb->next;
4197 skb2 != (struct sk_buff *)&sk->receive_queue;
4198 skb2 = skb2->next)
4199 {
4200 if (before(skb2->h.th->seq, sk->acked_seq+1))
4201 {
4202 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4203 {
4204 newwindow = sk->window -
4205 (skb2->h.th->ack_seq - sk->acked_seq);
4206 if (newwindow < 0)
4207 newwindow = 0;
4208 sk->window = newwindow;
4209 sk->acked_seq = skb2->h.th->ack_seq;
4210 }
4211 skb2->acked = 1;
4212 /*
4213 * When we ack the fin, we do
4214 * the fin handling.
4215 */
4216 if (skb2->h.th->fin)
4217 {
4218						tcp_fin(skb2,sk,skb2->h.th);	/* note: operate on skb2, the frame being acked */
4219 }
4220
4221 /*
4222 * Force an immediate ack.
4223 */
4224
4225 sk->ack_backlog = sk->max_ack_backlog;
4226 }
4227 else
4228 {
4229 break;
4230 }
4231 }
4232
4233 /*
4234 * This also takes care of updating the window.
4235 * This if statement needs to be simplified.
4236 */
4237 if (!sk->delay_acks ||
4238 sk->ack_backlog >= sk->max_ack_backlog ||
4239 sk->bytes_rcv > sk->max_unacked || th->fin) {
4240	/*			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4241			}	/* no action needed here: skb->acked is set, so the unconditional tcp_send_ack() below fires at once */
4242 else
4243 {
4244 sk->ack_backlog++;
4245 if(sk->debug)
4246 printk("Ack queued.\n");
4247 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4248 }
4249 }
4250 }
4251
4252 /*
4253 * If we've missed a packet, send an ack.
4254 * Also start a timer to send another.
4255 */
4256
4257 if (!skb->acked)
4258 {
4259
4260 /*
4261 * This is important. If we don't have much room left,
4262 * we need to throw out a few packets so we have a good
4263 * window. Note that mtu is used, not mss, because mss is really
4264 * for the send side. He could be sending us stuff as large as mtu.
4265 */
4266
4267 while (sk->prot->rspace(sk) < sk->mtu)
4268 {
4269 skb1 = skb_peek(&sk->receive_queue);
4270 if (skb1 == NULL)
4271 {
4272 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4273 break;
4274 }
4275
4276 /*
4277 * Don't throw out something that has been acked.
4278 */
4279
4280 if (skb1->acked)
4281 {
4282 break;
4283 }
4284
4285 skb_unlink(skb1);
4286 kfree_skb(skb1, FREE_READ);
4287 }
4288 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4289 sk->ack_backlog++;
4290 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4291 }
4292 else
4293 {
4294 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4295 }
4296
4297 /*
4298 * Now tell the user we may have some data.
4299 */
4300
4301 if (!sk->dead)
4302 {
4303 if(sk->debug)
4304 printk("Data wakeup.\n");
4305 sk->data_ready(sk,0);
4306 }
4307 return(0);
4308 }
4309
4310
4311 /*
4312 * This routine is only called when we have urgent data
4313	 *	signalled. It's the 'slow' part of tcp_urg. It could be
4314 * moved inline now as tcp_urg is only called from one
4315 * place. We handle URGent data wrong. We have to - as
4316 * BSD still doesn't use the correction from RFC961.
4317 */
4318
4319 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4320 {
4321 u32 ptr = ntohs(th->urg_ptr);
4322
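	/* The urgent pointer is taken to point one past the urgent byte
	 * (the uncorrected reading noted above), so step back to address
	 * the urgent byte itself before adding the segment sequence. */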
4323 if (ptr)
4324 ptr--;
4325 ptr += th->seq;
4326
4327 /* ignore urgent data that we've already seen and read */
4328 if (after(sk->copied_seq, ptr))
4329 return;
4330
4331 /* do we already have a newer (or duplicate) urgent pointer? */
4332 if (sk->urg_data && !after(ptr, sk->urg_seq))
4333 return;
4334
4335 /* tell the world about our new urgent pointer */
4336 if (sk->proc != 0) {
4337 if (sk->proc > 0) {
4338 kill_proc(sk->proc, SIGURG, 1);
4339 } else {
4340 kill_pg(-sk->proc, SIGURG, 1);
4341 }
4342 }
4343 sk->urg_data = URG_NOTYET;
4344 sk->urg_seq = ptr;
4345 }
4346
4347 /*
4348 * This is the 'fast' part of urgent handling.
4349 */
4350
4351 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4352 unsigned long saddr, unsigned long len)
4353 {
4354 u32 ptr;
4355
4356 /*
4357 * Check if we get a new urgent pointer - normally not
4358 */
4359
4360 if (th->urg)
4361 tcp_check_urg(sk,th);
4362
4363 /*
4364 * Do we wait for any urgent data? - normally not
4365 */
4366
4367 if (sk->urg_data != URG_NOTYET)
4368 return 0;
4369
4370 /*
4371 * Is the urgent pointer pointing into this packet?
4372 */
4373
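	/* Convert the urgent byte's sequence number into a byte offset
	 * from the start of the TCP header, hence the th->doff*4 term. */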
4374 ptr = sk->urg_seq - th->seq + th->doff*4;
4375 if (ptr >= len)
4376 return 0;
4377
4378 /*
4379 * Ok, got the correct packet, update info
4380 */
4381
4382 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4383 if (!sk->dead)
4384 sk->data_ready(sk,0);
4385 return 0;
4386 }
4387
4388 /*
4389 * This will accept the next outstanding connection.
4390 */
4391
4392 static struct sock *tcp_accept(struct sock *sk, int flags)
4393 {
4394 struct sock *newsk;
4395 struct sk_buff *skb;
4396
4397 /*
4398 * We need to make sure that this socket is listening,
4399 * and that it has something pending.
4400 */
4401
4402 if (sk->state != TCP_LISTEN)
4403 {
4404 sk->err = EINVAL;
4405 return(NULL);
4406 }
4407
4408 /* Avoid the race. */
4409 cli();
4410 sk->inuse = 1;
4411
4412 while((skb = tcp_dequeue_established(sk)) == NULL)
4413 {
4414 if (flags & O_NONBLOCK)
4415 {
4416 sti();
4417 release_sock(sk);
4418 sk->err = EAGAIN;
4419 return(NULL);
4420 }
4421
4422 release_sock(sk);
4423 interruptible_sleep_on(sk->sleep);
4424 if (current->signal & ~current->blocked)
4425 {
4426 sti();
4427 sk->err = ERESTARTSYS;
4428 return(NULL);
4429 }
4430 sk->inuse = 1;
4431 }
4432 sti();
4433
4434 /*
4435 * Now all we need to do is return skb->sk.
4436 */
4437
4438 newsk = skb->sk;
4439
4440 kfree_skb(skb, FREE_READ);
4441 sk->ack_backlog--;
4442 release_sock(sk);
4443 return(newsk);
4444 }
4445
4446
4447 /*
4448 * This will initiate an outgoing connection.
4449 */
4450
4451 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4452 {
4453 struct sk_buff *buff;
4454 struct device *dev=NULL;
4455 unsigned char *ptr;
4456 int tmp;
4457 int atype;
4458 struct tcphdr *t1;
4459 struct rtable *rt;
4460
4461 if (sk->state != TCP_CLOSE)
4462 {
4463 return(-EISCONN);
4464 }
4465
4466 if (addr_len < 8)
4467 return(-EINVAL);
4468
4469 if (usin->sin_family && usin->sin_family != AF_INET)
4470 return(-EAFNOSUPPORT);
4471
4472 /*
4473 * connect() to INADDR_ANY means loopback (BSD'ism).
4474 */
4475
4476 if(usin->sin_addr.s_addr==INADDR_ANY)
4477 usin->sin_addr.s_addr=ip_my_addr();
4478
4479 /*
4480 * Don't want a TCP connection going to a broadcast address
4481 */
4482
4483 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4484 return -ENETUNREACH;
4485
4486 sk->inuse = 1;
4487 sk->daddr = usin->sin_addr.s_addr;
4488 sk->write_seq = tcp_init_seq();
4489 sk->window_seq = sk->write_seq;
4490 sk->rcv_ack_seq = sk->write_seq -1;
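	/* Primed one below the initial sequence so the peer's ACK of our
	 * SYN always registers as new when tcp_ack() examines it. */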
4491 sk->err = 0;
4492 sk->dummy_th.dest = usin->sin_port;
4493 release_sock(sk);
4494
4495 buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4496 if (buff == NULL)
4497 {
4498 return(-ENOMEM);
4499 }
4500 sk->inuse = 1;
4501 buff->sk = sk;
4502 buff->free = 0;
4503 buff->localroute = sk->localroute;
4504
4505
4506 /*
4507 * Put in the IP header and routing stuff.
4508 */
4509
4510 rt=ip_rt_route(sk->daddr, NULL, NULL);
4511
4512
4513 /*
4514 * We need to build the routing stuff from the things saved in skb.
4515 */
4516
4517 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4518 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4519 if (tmp < 0)
4520 {
4521 sk->prot->wfree(sk, buff);
4522 release_sock(sk);
4523 return(-ENETUNREACH);
4524 }
4525
4526 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4527
4528 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4529 t1->seq = ntohl(sk->write_seq++);
4530 sk->sent_seq = sk->write_seq;
4531 buff->h.seq = sk->write_seq;
4532 t1->ack = 0;
4533 t1->window = 2;
4534 t1->res1=0;
4535 t1->res2=0;
4536 t1->rst = 0;
4537 t1->urg = 0;
4538 t1->psh = 0;
4539 t1->syn = 1;
4540 t1->urg_ptr = 0;
4541	t1->doff = 6;	/* six 32-bit words: 20 byte header + 4 byte MSS option */
4542 /* use 512 or whatever user asked for */
4543
4544 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4545 sk->window_clamp=rt->rt_window;
4546 else
4547 sk->window_clamp=0;
4548
4549 if (sk->user_mss)
4550 sk->mtu = sk->user_mss;
4551 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4552 sk->mtu = rt->rt_mss;
4553 else
4554 {
4555 #ifdef CONFIG_INET_SNARL
4556 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4557 #else
4558 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4559 #endif
4560 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4561 else
4562 sk->mtu = MAX_WINDOW;
4563 }
4564 /*
4565 * but not bigger than device MTU
4566 */
4567
4568 if(sk->mtu <32)
4569 sk->mtu = 32; /* Sanity limit */
4570
4571 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4572
4573 /*
4574 * Put in the TCP options to say MTU.
4575 */
4576
4577	ptr = skb_put(buff,4);
4578	ptr[0] = 2;			/* option kind: maximum segment size */
4579	ptr[1] = 4;			/* option length in bytes */
4580	ptr[2] = (sk->mtu) >> 8;	/* MSS value, high byte first */
4581	ptr[3] = (sk->mtu) & 0xff;
4582 tcp_send_check(t1, sk->saddr, sk->daddr,
4583 sizeof(struct tcphdr) + 4, sk);
4584
4585 /*
4586 * This must go first otherwise a really quick response will get reset.
4587 */
4588
4589 tcp_cache_zap();
4590 tcp_set_state(sk,TCP_SYN_SENT);
4591 if(rt&&rt->rt_flags&RTF_IRTT)
4592 sk->rto = rt->rt_irtt;
4593 else
4594 sk->rto = TCP_TIMEOUT_INIT;
4595 sk->retransmit_timer.function=&retransmit_timer;
4596 sk->retransmit_timer.data = (unsigned long)sk;
4597 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4598 sk->retransmits = 0; /* Now works the right way instead of a hacked initial setting */
4599
4600 sk->prot->queue_xmit(sk, dev, buff, 0);
4601 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4602 tcp_statistics.TcpActiveOpens++;
4603 tcp_statistics.TcpOutSegs++;
4604
4605 release_sock(sk);
4606 return(0);
4607 }
4608
4609
4610 /* This function checks to see if the tcp header is actually acceptable. */
4611 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4612 struct options *opt, unsigned long saddr, struct device *dev)
4613 {
4614 u32 next_seq;
4615
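	/* next_seq first holds just the segment's data length (a FIN
	 * counts as one unit); th->seq is added below to form the true
	 * end-of-segment sequence number. */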
4616 next_seq = len - 4*th->doff;
4617 if (th->fin)
4618 next_seq++;
4619 /* if we have a zero window, we can't have any data in the packet.. */
4620 if (next_seq && !sk->window)
4621 goto ignore_it;
4622 next_seq += th->seq;
4623
4624 /*
4625 * This isn't quite right. sk->acked_seq could be more recent
4626 * than sk->window. This is however close enough. We will accept
4627 * slightly more packets than we should, but it should not cause
4628 * problems unless someone is trying to forge packets.
4629 */
4630
4631 /* have we already seen all of this packet? */
4632 if (!after(next_seq+1, sk->acked_seq))
4633 goto ignore_it;
4634 /* or does it start beyond the window? */
4635 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4636 goto ignore_it;
4637
4638 /* ok, at least part of this packet would seem interesting.. */
4639 return 1;
4640
4641 ignore_it:
4642 if (th->rst)
4643 return 0;
4644
4645 /*
4646 * Send a reset if we get something not ours and we are
4647 * unsynchronized. Note: We don't do anything to our end. We
4648	 *	are just killing the bogus remote connection; then we will
4649 * connect again and it will work (with luck).
4650 */
4651
4652 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4653 {
4654 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4655 return 1;
4656 }
4657
4658 /* Try to resync things. */
4659 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4660 return 0;
4661 }
4662
4663 /*
4664 * When we get a reset we do this.
4665 */
4666
4667 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4668 {
4669 sk->zapped = 1;
4670 sk->err = ECONNRESET;
4671 if (sk->state == TCP_SYN_SENT)
4672 sk->err = ECONNREFUSED;
4673 if (sk->state == TCP_CLOSE_WAIT)
4674 sk->err = EPIPE;
4675 #ifdef TCP_DO_RFC1337
4676 /*
4677 * Time wait assassination protection [RFC1337]
4678 */
4679 if(sk->state!=TCP_TIME_WAIT)
4680 {
4681 tcp_set_state(sk,TCP_CLOSE);
4682 sk->shutdown = SHUTDOWN_MASK;
4683 }
4684 #else
4685 tcp_set_state(sk,TCP_CLOSE);
4686 sk->shutdown = SHUTDOWN_MASK;
4687 #endif
4688 if (!sk->dead)
4689 sk->state_change(sk);
4690 kfree_skb(skb, FREE_READ);
4691 release_sock(sk);
4692 return(0);
4693 }
4694
4695 /*
4696 * A TCP packet has arrived.
4697 * skb->h.raw is the TCP header.
4698 */
4699
4700 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4701 __u32 daddr, unsigned short len,
4702 __u32 saddr, int redo, struct inet_protocol * protocol)
4703 {
4704 struct tcphdr *th;
4705 struct sock *sk;
4706 int syn_ok=0;
4707
4708 tcp_statistics.TcpInSegs++;
4709 if(skb->pkt_type!=PACKET_HOST)
4710 {
4711 kfree_skb(skb,FREE_READ);
4712 return(0);
4713 }
4714
4715 th = skb->h.th;
4716
4717 /*
4718 * Find the socket, using the last hit cache if applicable.
4719 */
4720
4721 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4722 sk=(struct sock *)th_cache_sk;
4723 else
4724 {
4725 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4726 th_cache_saddr=saddr;
4727 th_cache_daddr=daddr;
4728 th_cache_dport=th->dest;
4729 th_cache_sport=th->source;
4730 th_cache_sk=sk;
4731 }
4732
4733 /*
4734 * If this socket has got a reset it's to all intents and purposes
4735 * really dead. Count closed sockets as dead.
4736 *
4737 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4738 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4739	 *	exist, so it should cause resets as if the port were unreachable.
4740 */
4741
4742 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4743 sk=NULL;
4744
4745 if (!redo)
4746 {
4747 /*
4748 * Pull up the IP header.
4749 */
4750 skb_pull(skb, skb->h.raw-skb->data);
4751 /*
4752 * Try to use the device checksum if provided.
4753 */
4754 if (
4755 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4756 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4757 )
4758 {
4759 skb->sk = NULL;
4760 kfree_skb(skb,FREE_READ);
4761 /*
4762 * We don't release the socket because it was
4763 * never marked in use.
4764 */
4765 return(0);
4766 }
4767 th->seq = ntohl(th->seq);
4768
4769 /* See if we know about the socket. */
4770 if (sk == NULL)
4771 {
4772 /*
4773 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4774 */
4775 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4776 skb->sk = NULL;
4777 /*
4778 * Discard frame
4779 */
4780 kfree_skb(skb, FREE_READ);
4781 return(0);
4782 }
4783
4784 /* skb->len = len;*/
4785 skb->acked = 0;
4786 skb->used = 0;
4787 skb->free = 0;
4788 skb->saddr = daddr;
4789 skb->daddr = saddr;
4790
4791 /* We may need to add it to the backlog here. */
4792 cli();
4793 if (sk->inuse)
4794 {
4795 skb_queue_tail(&sk->back_log, skb);
4796 sti();
4797 return(0);
4798 }
4799 sk->inuse = 1;
4800 sti();
4801 }
4802 else
4803 {
4804 if (sk==NULL)
4805 {
4806 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4807 skb->sk = NULL;
4808 kfree_skb(skb, FREE_READ);
4809 return(0);
4810 }
4811 }
4812
4813
4814 if (!sk->prot)
4815 {
4816 printk("IMPOSSIBLE 3\n");
4817 return(0);
4818 }
4819
4820
4821 /*
4822 * Charge the memory to the socket.
4823 */
4824
4825 if (sk->rmem_alloc + skb->truesize >= sk->rcvbuf)
4826 {
4827 kfree_skb(skb, FREE_READ);
4828 release_sock(sk);
4829 return(0);
4830 }
4831
4832 skb->sk=sk;
4833 sk->rmem_alloc += skb->truesize;
4834
4835 /*
4836 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4837 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4838 * compatibility. We also set up variables more thoroughly [Karn notes in the
4839 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4840 */
4841
4842 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4843 {
4844
4845 /*
4846 * Now deal with unusual cases.
4847 */
4848
4849 if(sk->state==TCP_LISTEN)
4850 {
4851 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4852 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4853
4854 /*
4855 * We don't care for RST, and non SYN are absorbed (old segments)
4856		 *	Broadcast/multicast SYN isn't allowed. Note: there is a bug in that
4857		 *	if you change the netmask on a running connection it can go broadcast.
4858		 *	Even Suns have this problem, so I'm ignoring it.
4859 */
4860
4861 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4862 {
4863 kfree_skb(skb, FREE_READ);
4864 release_sock(sk);
4865 return 0;
4866 }
4867
4868 /*
4869 * Guess we need to make a new socket up
4870 */
4871
4872 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4873
4874 /*
4875 * Now we have several options: In theory there is nothing else
4876 * in the frame. KA9Q has an option to send data with the syn,
4877 * BSD accepts data with the syn up to the [to be] advertised window
4878 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4879 * it, that fits the spec precisely and avoids incompatibilities. It
4880 * would be nice in future to drop through and process the data.
4881 */
4882
4883 release_sock(sk);
4884 return 0;
4885 }
4886
4887 /* retransmitted SYN? */
4888 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4889 {
4890 kfree_skb(skb, FREE_READ);
4891 release_sock(sk);
4892 return 0;
4893 }
4894
4895 /*
4896 * SYN sent means we have to look for a suitable ack and either reset
4897 * for bad matches or go to connected
4898 */
4899
4900 if(sk->state==TCP_SYN_SENT)
4901 {
4902 /* Crossed SYN or previous junk segment */
4903 if(th->ack)
4904 {
4905 /* We got an ack, but it's not a good ack */
4906 if(!tcp_ack(sk,th,saddr,len))
4907 {
4908				/* Reset the ack - it's an ack from a
4909				   different connection  [ th->rst is checked in tcp_reset()] */
4910 tcp_statistics.TcpAttemptFails++;
4911 tcp_reset(daddr, saddr, th,
4912 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4913 kfree_skb(skb, FREE_READ);
4914 release_sock(sk);
4915 return(0);
4916 }
4917 if(th->rst)
4918 return tcp_std_reset(sk,skb);
4919 if(!th->syn)
4920 {
4921				/* A valid ack from a different connection
4922				   attempt. Shouldn't happen, but cover it */
4923 kfree_skb(skb, FREE_READ);
4924 release_sock(sk);
4925 return 0;
4926 }
4927 /*
4928 * Ok.. it's good. Set up sequence numbers and
4929 * move to established.
4930 */
4931 syn_ok=1; /* Don't reset this connection for the syn */
4932 sk->acked_seq=th->seq+1;
4933 sk->fin_seq=th->seq;
4934 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4935 tcp_set_state(sk, TCP_ESTABLISHED);
4936 tcp_options(sk,th);
4937 sk->dummy_th.dest=th->source;
4938 sk->copied_seq = sk->acked_seq;
4939 if(!sk->dead)
4940 {
4941 sk->state_change(sk);
4942 sock_wake_async(sk->socket, 0);
4943 }
4944 if(sk->max_window==0)
4945 {
4946 sk->max_window = 32;
4947 sk->mss = min(sk->max_window, sk->mtu);
4948 }
4949 }
4950 else
4951 {
4952 /* See if SYN's cross. Drop if boring */
4953 if(th->syn && !th->rst)
4954 {
4955 /* Crossed SYN's are fine - but talking to
4956 yourself is right out... */
4957 if(sk->saddr==saddr && sk->daddr==daddr &&
4958 sk->dummy_th.source==th->source &&
4959 sk->dummy_th.dest==th->dest)
4960 {
4961 tcp_statistics.TcpAttemptFails++;
4962 return tcp_std_reset(sk,skb);
4963 }
4964 tcp_set_state(sk,TCP_SYN_RECV);
4965
4966 /*
4967 * FIXME:
4968 * Must send SYN|ACK here
4969 */
4970 }
4971 /* Discard junk segment */
4972 kfree_skb(skb, FREE_READ);
4973 release_sock(sk);
4974 return 0;
4975 }
4976 /*
4977 * SYN_RECV with data maybe.. drop through
4978 */
4979 goto rfc_step6;
4980 }
4981
4982 /*
4983 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
4984 * a more complex suggestion for fixing these reuse issues in RFC1644
4985 * but not yet ready for general use. Also see RFC1379.
4986 */
4987
4988 #define BSD_TIME_WAIT
4989 #ifdef BSD_TIME_WAIT
4990 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
4991 after(th->seq, sk->acked_seq) && !th->rst)
4992 {
4993 u32 seq = sk->write_seq;
4994 if(sk->debug)
4995 printk("Doing a BSD time wait\n");
4996 tcp_statistics.TcpEstabResets++;
4997 sk->rmem_alloc -= skb->truesize;
4998 skb->sk = NULL;
4999 sk->err=ECONNRESET;
5000 tcp_set_state(sk, TCP_CLOSE);
5001 sk->shutdown = SHUTDOWN_MASK;
5002 release_sock(sk);
5003 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5004 if (sk && sk->state==TCP_LISTEN)
5005 {
5006 sk->inuse=1;
5007 skb->sk = sk;
5008 sk->rmem_alloc += skb->truesize;
5009 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5010 release_sock(sk);
5011 return 0;
5012 }
5013 kfree_skb(skb, FREE_READ);
5014 return 0;
5015 }
5016 #endif
5017 }
5018
5019 /*
5020 * We are now in normal data flow (see the step list in the RFC)
5021 * Note most of these are inline now. I'll inline the lot when
5022 * I have time to test it hard and look at what gcc outputs
5023 */
5024
5025 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5026 {
5027 kfree_skb(skb, FREE_READ);
5028 release_sock(sk);
5029 return 0;
5030 }
5031
5032 if(th->rst)
5033 return tcp_std_reset(sk,skb);
5034
5035 /*
5036 * !syn_ok is effectively the state test in RFC793.
5037 */
5038
5039 if(th->syn && !syn_ok)
5040 {
5041 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5042 return tcp_std_reset(sk,skb);
5043 }
5044
5045 /*
5046 * Process the ACK
5047 */
5048
5049
5050 if(th->ack && !tcp_ack(sk,th,saddr,len))
5051 {
5052 /*
5053 * Our three way handshake failed.
5054 */
5055
5056 if(sk->state==TCP_SYN_RECV)
5057 {
5058 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5059 }
5060 kfree_skb(skb, FREE_READ);
5061 release_sock(sk);
5062 return 0;
5063 }
5064
5065 rfc_step6: /* I'll clean this up later */
5066
5067 /*
5068 * Process urgent data
5069 */
5070
5071 if(tcp_urg(sk, th, saddr, len))
5072 {
5073 kfree_skb(skb, FREE_READ);
5074 release_sock(sk);
5075 return 0;
5076 }
5077
5078
5079 /*
5080 * Process the encapsulated data
5081 */
5082
5083 if(tcp_data(skb,sk, saddr, len))
5084 {
5085 kfree_skb(skb, FREE_READ);
5086 release_sock(sk);
5087 return 0;
5088 }
5089
5090 /*
5091 * And done
5092 */
5093
5094 release_sock(sk);
5095 return 0;
5096 }
5097
5098 /*
5099 * This routine sends a packet with an out of date sequence
5100 * number. It assumes the other end will try to ack it.
5101 */
5102
5103 static void tcp_write_wakeup(struct sock *sk)
5104 {
5105 struct sk_buff *buff,*skb;
5106 struct tcphdr *t1;
5107 struct device *dev=NULL;
5108 int tmp;
5109
5110 if (sk->zapped)
5111 return; /* After a valid reset we can send no more */
5112
5113 /*
5114 * Write data can still be transmitted/retransmitted in the
5115 * following states. If any other state is encountered, return.
5116 * [listen/close will never occur here anyway]
5117 */
5118
5119 if (sk->state != TCP_ESTABLISHED &&
5120 sk->state != TCP_CLOSE_WAIT &&
5121 sk->state != TCP_FIN_WAIT1 &&
5122 sk->state != TCP_LAST_ACK &&
5123 sk->state != TCP_CLOSING
5124 )
5125 {
5126 return;
5127 }
5128 if ( before(sk->sent_seq, sk->window_seq) &&
5129 (skb=skb_peek(&sk->write_queue)))
5130 {
5131 /*
5132		 *	We are probing the opening of a window
5133		 *	but the window size is != 0; this must be
5134		 *	a result of SWS avoidance at the sender.
5135 */
5136
5137 struct iphdr *iph;
5138 struct tcphdr *th;
5139 struct tcphdr *nth;
5140 unsigned long win_size, ow_size;
5141 void * tcp_data_start;
5142
5143 /*
5144 * How many bytes can we send ?
5145 */
5146
5147 win_size = sk->window_seq - sk->sent_seq;
5148
5149 /*
5150 * Recover the buffer pointers
5151 */
5152
5153 iph = (struct iphdr *)skb->ip_hdr;
5154 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5155
5156 /*
5157 * Grab the data for a temporary frame
5158 */
5159
5160 buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 +
5161 (iph->ihl << 2) +
5162 sk->prot->max_header + 15,
5163 1, GFP_ATOMIC);
5164 if ( buff == NULL )
5165 return;
5166
5167 /*
5168 * If we strip the packet on the write queue we must
5169 * be ready to retransmit this one
5170 */
5171
5172 buff->free = /*0*/1;
5173
5174 buff->sk = sk;
5175 buff->localroute = sk->localroute;
5176
5177 /*
5178 * Put headers on the new packet
5179 */
5180
5181 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5182 IPPROTO_TCP, sk->opt, buff->truesize,
5183 sk->ip_tos,sk->ip_ttl);
5184 if (tmp < 0)
5185 {
5186 sk->prot->wfree(sk, buff);
5187 return;
5188 }
5189
5190 /*
5191 * Move the TCP header over
5192 */
5193
5194 buff->dev = dev;
5195
5196 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5197
5198 memcpy(nth, th, th->doff * 4);
5199
5200 /*
5201 * Correct the new header
5202 */
5203
5204 nth->ack = 1;
5205 nth->ack_seq = ntohl(sk->acked_seq);
5206 nth->window = ntohs(tcp_select_window(sk));
5207 nth->check = 0;
5208
5209 /*
5210 * Find the first data byte.
5211 */
5212
5213 tcp_data_start = skb->data + skb->dev->hard_header_len +
5214 (iph->ihl << 2) + th->doff * 4;
5215
5216 /*
5217 * Add it to our new buffer
5218 */
5219 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5220
5221 /*
5222 * Remember our right edge sequence number.
5223 */
5224
5225 buff->h.seq = sk->sent_seq + win_size;
5226 sk->sent_seq = buff->h.seq; /* Hack */
5227 #if 0
5228
5229 /*
5230 * now: shrink the queue head segment
5231 */
5232
5233 th->check = 0;
5234 ow_size = skb->len - win_size -
5235 ((unsigned long) (tcp_data_start - (void *) skb->data));
5236
5237 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5238 skb_trim(skb,skb->len-win_size);
5239 sk->sent_seq += win_size;
5240 th->seq = htonl(sk->sent_seq);
5241 if (th->urg)
5242 {
5243 unsigned short urg_ptr;
5244
5245 urg_ptr = ntohs(th->urg_ptr);
5246 if (urg_ptr <= win_size)
5247 th->urg = 0;
5248 else
5249 {
5250 urg_ptr -= win_size;
5251 th->urg_ptr = htons(urg_ptr);
5252 nth->urg_ptr = htons(win_size);
5253 }
5254 }
5255 #else
5256 if(th->urg && ntohs(th->urg_ptr) < win_size)
5257 nth->urg = 0;
5258 #endif
5259
5260 /*
5261 * Checksum the split buffer
5262 */
5263
5264 tcp_send_check(nth, sk->saddr, sk->daddr,
5265 nth->doff * 4 + win_size , sk);
5266 }
5267 else
5268 {
5269 buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5270 if (buff == NULL)
5271 return;
5272
5273 buff->free = 1;
5274 buff->sk = sk;
5275 buff->localroute = sk->localroute;
5276
5277 /*
5278 * Put in the IP header and routing stuff.
5279 */
5280
5281 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5282 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5283 if (tmp < 0)
5284 {
5285 sk->prot->wfree(sk, buff);
5286 return;
5287 }
5288
5289 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5290 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5291
5292 /*
5293 * Use a previous sequence.
5294 * This should cause the other end to send an ack.
5295 */
5296
5297 t1->seq = htonl(sk->sent_seq-1);
5298 t1->ack = 1;
5299 t1->res1= 0;
5300 t1->res2= 0;
5301 t1->rst = 0;
5302 t1->urg = 0;
5303 t1->psh = 0;
5304 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5305 t1->syn = 0;
5306 t1->ack_seq = ntohl(sk->acked_seq);
5307 t1->window = ntohs(tcp_select_window(sk));
5308 t1->doff = sizeof(*t1)/4;
5309 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5310
5311 }
5312
5313 /*
5314 * Send it.
5315 */
5316
5317 sk->prot->queue_xmit(sk, dev, buff, 1);
5318 tcp_statistics.TcpOutSegs++;
5319 }
5320
5321 /*
5322 * A window probe timeout has occurred.
5323 */
5324
5325 void tcp_send_probe0(struct sock *sk)
5326 {
5327 if (sk->zapped)
5328 return; /* After a valid reset we can send no more */
5329
5330 tcp_write_wakeup(sk);
5331
5332 sk->backoff++;
5333 sk->rto = min(sk->rto << 1, 120*HZ);
5334 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5335 sk->retransmits++;
5336 sk->prot->retransmits ++;
5337 }
5338
5339 /*
5340 * Socket option code for TCP.
5341 */
5342
5343 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5344 {
5345 int val,err;
5346
5347 if(level!=SOL_TCP)
5348 return ip_setsockopt(sk,level,optname,optval,optlen);
5349
5350 if (optval == NULL)
5351 return(-EINVAL);
5352
5353 err=verify_area(VERIFY_READ, optval, sizeof(int));
5354 if(err)
5355 return err;
5356
5357 val = get_user((int *)optval);
5358
5359 switch(optname)
5360 {
5361 case TCP_MAXSEG:
5362 /*
5363			 *	Values greater than the interface MTU won't take effect. However,
5364			 *	at the point when this call is made we typically don't yet know
5365			 *	which interface is going to be used.
5366 */
5367 if(val<1||val>MAX_WINDOW)
5368 return -EINVAL;
5369 sk->user_mss=val;
5370 return 0;
5371 case TCP_NODELAY:
5372 sk->nonagle=(val==0)?0:1;
5373 return 0;
5374 default:
5375 return(-ENOPROTOOPT);
5376 }
5377 }
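/*
 *	Usage sketch (user space, illustrative only, not part of this file):
 *	turning off the Nagle algorithm via the TCP_NODELAY option handled
 *	above. 'fd' is assumed to be a connected TCP socket.
 */
#if 0
	int one = 1;
	setsockopt(fd, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one));
#endif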
5378
5379 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5380 {
5381 int val,err;
5382
5383 if(level!=SOL_TCP)
5384 return ip_getsockopt(sk,level,optname,optval,optlen);
5385
5386 switch(optname)
5387 {
5388 case TCP_MAXSEG:
5389 val=sk->user_mss;
5390 break;
5391 case TCP_NODELAY:
5392 val=sk->nonagle;
5393 break;
5394 default:
5395 return(-ENOPROTOOPT);
5396 }
5397 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5398 if(err)
5399 return err;
5400 put_user(sizeof(int),(int *) optlen);
5401
5402 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5403 if(err)
5404 return err;
5405 put_user(val,(int *)optval);
5406
5407 return(0);
5408 }
5409
5410
5411 struct proto tcp_prot = {
5412 sock_wmalloc,
5413 sock_rmalloc,
5414 sock_wfree,
5415 sock_rfree,
5416 sock_rspace,
5417 sock_wspace,
5418 tcp_close,
5419 tcp_read,
5420 tcp_write,
5421 tcp_sendto,
5422 tcp_recvfrom,
5423 ip_build_header,
5424 tcp_connect,
5425 tcp_accept,
5426 ip_queue_xmit,
5427 tcp_retransmit,
5428 tcp_write_wakeup,
5429 tcp_read_wakeup,
5430 tcp_rcv,
5431 tcp_select,
5432 tcp_ioctl,
5433 	NULL,			/* init */
5434 	tcp_shutdown,
5435 	tcp_setsockopt,
5436 	tcp_getsockopt,
5437 	128,			/* max_header */
5438 	0,			/* retransmits */
5439 	"TCP",			/* name */
5440 	0, 0,			/* inuse, highestinuse */
5441 	{NULL,}			/* sock_array */
5442 };