/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken;
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code, obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. select
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly:	ack < copied bug fix.
 *		Johannes Stille	:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *		Peter De Schrijver:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle select() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), select() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in selecting before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFCs. For other useful protocol
 *					references see: Comer, KA9Q NOS; and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works, see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	Select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Rewrite output state machine to use a single queue.
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack and
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
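
/*
 * For orientation, a sketch of the usual active-close walk through the
 * states above (a summary of the list, not an exhaustive diagram):
 *
 *	ESTABLISHED --(close(): our FIN sent)--> FIN_WAIT1
 *	FIN_WAIT1   --(our FIN acked)----------> FIN_WAIT2
 *	FIN_WAIT2   --(their FIN arrives)------> TIME_WAIT
 *	TIME_WAIT   --(2*MSL timer expires)----> CLOSE
 *
 * If the two FINs cross in flight, FIN_WAIT1 moves to CLOSING instead
 * and reaches TIME_WAIT on the final ACK.
 */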

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like.  tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 *
 * Use of PSH (4.2.2.2)
 *   MAY aggregate data sent without the PSH flag. (does)
 *   MAY queue data received without the PSH flag. (does)
 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *   MAY implement PSH on send calls. (doesn't, thus:)
 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 *     MUST set PSH on last segment (does)
 *   MAY pass received PSH to application layer (doesn't)
 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 * Window Size (4.2.2.3, 4.2.2.16)
 *   MUST treat window size as an unsigned number (does)
 *   SHOULD treat window size as a 32-bit number (does not)
 *   MUST NOT shrink window once it is offered (does not normally)
 *
 * Urgent Pointer (4.2.2.4)
 * **MUST point urgent pointer to last byte of urgent data (not right
 *   after). (doesn't, to be like BSD)
 *   MUST inform application layer asynchronously of incoming urgent
 *   data. (does)
 *   MUST provide application with means of determining the amount of
 *   urgent data pending. (does)
 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *	[Follows BSD 1 byte of urgent data]
 *
 * TCP Options (4.2.2.5)
 *   MUST be able to receive TCP options in any segment. (does)
 *   MUST ignore unsupported options (does)
 *
 * Maximum Segment Size Option (4.2.2.6)
 *   MUST implement both sending and receiving MSS. (does)
 *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *   it always). (does, even when MSS == 536, which is legal)
 *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 *   MUST calculate "effective send MSS" correctly:
 *	min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *   (does - but allows operator override)
 *
 * TCP Checksum (4.2.2.7)
 *   MUST generate and check TCP checksum. (does)
 *
 * Initial Sequence Number Selection (4.2.2.8)
 *   MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *   OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *   necessary for 10Mbps networks - and harder than BSD to spoof!)
 *
 * Simultaneous Open Attempts (4.2.2.10)
 *   MUST support simultaneous open attempts (does)
 *
 * Recovery from Old Duplicate SYN (4.2.2.11)
 *   MUST keep track of active vs. passive open (does)
 *
 * RST segment (4.2.2.12)
 *   SHOULD allow an RST segment to contain data (does, but doesn't do
 *   anything with it, which is standard)
 *
 * Closing a Connection (4.2.2.13)
 *   MUST inform application of whether connection was closed by RST or
 *   normal close. (does)
 *   MAY allow "half-duplex" close (treat connection as closed for the
 *   local app, even before handshake is done). (does)
 *   MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 * Retransmission Timeout (4.2.2.15)
 *   MUST implement Jacobson's slow start and congestion avoidance
 *   stuff. (does)
 *
 * Probing Zero Windows (4.2.2.17)
 *   MUST support probing of zero windows. (does)
 *   MAY keep offered window closed indefinitely. (does)
 *   MUST allow remote window to stay closed indefinitely. (does)
 *
 * Passive Open Calls (4.2.2.18)
 *   MUST NOT let new passive open affect other connections. (doesn't)
 *   MUST support passive opens (LISTENs) concurrently. (does)
 *
 * Time to Live (4.2.2.19)
 *   MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 * Event Processing (4.2.2.20)
 *   SHOULD queue out-of-order segments. (does)
 *   MUST aggregate ACK segments whenever possible. (does but badly)
 *
 * Retransmission Timeout Calculation (4.2.3.1)
 *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *   calculation. (does, or at least explains them in the comments 8*b)
 *   SHOULD initialize RTT to 0 and RTO to 3. (does)
 *
 * When to Send an ACK Segment (4.2.3.2)
 *   SHOULD implement delayed ACK. (does)
 *   MUST keep ACK delay < 0.5 sec. (does)
 *
 * When to Send a Window Update (4.2.3.3)
 *   MUST implement receiver-side SWS. (does)
 *
 * When to Send Data (4.2.3.4)
 *   MUST implement sender-side SWS. (does)
 *   SHOULD implement Nagle algorithm. (does)
 *
 * TCP Connection Failures (4.2.3.5)
 *   MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *   SHOULD inform application layer of soft errors. (does)
 *
 * TCP Keep-Alives (4.2.3.6)
 *   MAY provide keep-alives. (does)
 *   MUST make keep-alives configurable on a per-connection basis. (does)
 *   MUST default to no keep-alives. (does)
 * **MUST make keep-alive interval configurable. (doesn't)
 * **MUST make default keep-alive interval > 2 hours. (doesn't)
 *   MUST NOT interpret failure to ACK keep-alive packet as dead
 *   connection. (doesn't)
 *   SHOULD send keep-alive with no data. (does)
 *
 * TCP Multihoming (4.2.3.7)
 *   MUST get source address from IP layer before sending first
 *   SYN. (does)
 *   MUST use same local address for all segments of a connection. (does)
 *
 * IP Options (4.2.3.8)
 *   MUST ignore unsupported IP options. (does)
 *   MAY support Time Stamp and Record Route. (does)
 *   MUST allow application to specify a source route. (does)
 *   MUST allow received Source Route option to set route for all future
 *   segments on this connection. (does not (security issues))
 *
 * ICMP messages (4.2.3.9)
 *   MUST act on ICMP errors. (does)
 *   MUST slow transmission upon receipt of a Source Quench. (does)
 *   MUST NOT abort connection upon receipt of soft Destination
 *   Unreachables (0, 1, 5), Time Exceededs and Parameter
 *   Problems. (doesn't)
 *   SHOULD report soft Destination Unreachables etc. to the
 *   application. (does)
 *   SHOULD abort connection upon receipt of hard Destination Unreachable
 *   messages (2, 3, 4). (does)
 *
 * Remote Address Validation (4.2.3.10)
 *   MUST reject as an error OPEN for invalid remote IP address. (does)
 *   MUST ignore SYN with invalid source address. (does)
 *   MUST silently discard incoming SYN for broadcast/multicast
 *   address. (does)
 *
 * Asynchronous Reports (4.2.4.1)
 *   MUST provide mechanism for reporting soft errors to application
 *   layer. (does)
 *
 * Type of Service (4.2.4.2)
 *   MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 **/
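
/*
 * Arithmetic footnote to 4.2.2.8 above: at 1MHz the 32-bit sequence
 * space wraps in 2^32 microseconds, roughly 71 minutes; RFC 793's
 * 250KHz clock takes about 4.8 hours to do the same.
 */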

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/termios.h>
#include <linux/in.h>
#include <linux/fcntl.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <net/checksum.h>

/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	unsigned long flags;
	save_flags(flags);
	cli();
	th_cache_saddr=0;
	th_cache_daddr=0;
	th_cache_dport=0;
	th_cache_sport=0;
	th_cache_sk=NULL;
	restore_flags(flags);
}

static void tcp_close(struct sock *sk, int timeout);
static void tcp_read_wakeup(struct sock *sk);

/*
 *	The less said about this the better, but it works and will do for 1.2 (and 1.4 ;))
 */

static struct wait_queue *master_select_wakeup;

static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
	if(sk->state==TCP_CLOSE)
		tcp_cache_zap();
}

/*
 *	This routine picks a TCP window for a socket based on
 *	the following constraints
 *
 *	1. The window can never be shrunk once it is offered (RFC 793)
 *	2. We limit memory per socket
 */


static __inline__ unsigned short tcp_select_window(struct sock *sk)
{
	long free_space = sock_rspace(sk);
	long window = 0;

	if (free_space > 1024)
		free_space &= ~0x3FF;	/* make free space a multiple of 1024 */

	if(sk->window_clamp)
		free_space = min(sk->window_clamp, free_space);

	/*
	 * compute the actual window i.e.
	 * old_window - received_bytes_on_that_win
	 */

	if (sk->mss == 0)
		sk->mss = sk->mtu;

	window = sk->window - (sk->acked_seq - sk->lastwin_seq);

	if ( window < 0 ) {
		window = 0;
		printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
		       sk->window, sk->acked_seq, sk->lastwin_seq);
	}

	/*
	 * RFC 1122:
	 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
	 *  RCV.NXT + RCV.WND fixed until:
	 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
	 *
	 * i.e. don't raise the right edge of the window until you can raise
	 * it at least MSS bytes
	 */

	if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
		window += ((free_space - window) / sk->mss) * sk->mss;

	sk->window = window;
	sk->lastwin_seq = sk->acked_seq;

	return sk->window;
}
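
/*
 * Worked example for the SWS rule above (illustrative numbers only,
 * and assuming MAX_WINDOW/2 exceeds the mss): with free_space = 8192,
 * a current window of 2048 and sk->mss = 1460, free_space - window is
 * 6144 >= 1460, so the window is raised by (6144/1460)*1460 = 5840
 * bytes to 7888 - always a whole number of segments.
 */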

/*
 *	This function returns the amount that we can raise the
 *	usable window.
 */

static __inline__ unsigned short tcp_raise_window(struct sock *sk)
{
	long free_space = sock_rspace(sk);
	long window = 0;

	if (free_space > 1024)
		free_space &= ~0x3FF;	/* make free space a multiple of 1024 */

	if(sk->window_clamp)
		free_space = min(sk->window_clamp, free_space);

	/*
	 * compute the actual window i.e.
	 * old_window - received_bytes_on_that_win
	 */

	window = sk->window - (sk->acked_seq - sk->lastwin_seq);

	if (sk->mss == 0)
		sk->mss = sk->mtu;

	if ( window < 0 ) {
		window = 0;
		printk(KERN_DEBUG "TRW: win < 0 w=%d 1=%u 2=%u\n",
		       sk->window, sk->acked_seq, sk->lastwin_seq);
	}

	if ( (free_space - window) >= min(sk->mss, MAX_WINDOW/2) )
		return ((free_space - window) / sk->mss) * sk->mss;

	return 0;
}

/*
 *	Find someone to 'accept'. Must be called with
 *	sk->inuse=1 or cli()
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead=1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 *	Enter the time wait state.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}

/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;

		/* dl1bke 960201 - @%$$! Hope this cures strange race conditions
		 * with AX.25 mode VC. (esp. DAMA)
		 * If the buffer is locked we should not retransmit
		 * anyway, so we don't need all the fuss to prepare
		 * the buffer in this case.
		 * (the skb_pull() changes skb->data while we may
		 * actually try to send the data. Ough. A side
		 * effect is that we'll send some unnecessary data,
		 * but the alternative is disastrous...)
		 */

		if (skb_device_locked(skb))
			break;

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 * In general it's OK just to use the old packet.  However we
		 * need to use the current ack and window fields.  Urg and
		 * urg_ptr could possibly stand to be updated as well, but we
		 * don't keep the necessary data.  That shouldn't be a problem,
		 * if the other end is doing the right thing.  Since we're
		 * changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		{
			/* ANK: UGLY, but the bug, that was here, should be fixed.
			 */
			struct options * opt = (struct options*)skb->proto_priv;
			rt = ip_check_route(&sk->ip_route_cache, opt->srr?opt->faddr:iph->daddr, skb->localroute);
		}

		iph->id = htons(ip_id_count++);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
			iph->frag_off &= ~htons(IP_DF);
#endif
		ip_send_check(iph);

		if (rt==NULL)	/* Deep poo */
		{
			if(skb->sk)
			{
				skb->sk->err_soft=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			skb->dev=dev;
			skb->arp=1;
			if (rt->rt_hh)
			{
				memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len);
				if (!rt->rt_hh->hh_uptodate)
				{
					skb->arp = 0;
#if RT_CACHE_DEBUG >= 2
					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
#endif
				}
			}
			else if (dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep happy the odd buggy tcp that relies
			 *	on the fact BSD does this.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = htonl(sk->acked_seq);
			sk->ack_backlog = 0;
			sk->bytes_rcv = 0;
			th->window = htons(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	of a frame that hasn't even been sent yet!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits ++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}
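
/*
 * The comment in the routine above suggests RFC1141/RFC1624 incremental
 * checksumming as a small project. A minimal sketch of the RFC1624
 * update (a hypothetical helper, not used anywhere in this file): when
 * one 16-bit word of the segment changes from 'old' to 'new', the
 * checksum can be fixed up as HC' = ~(~HC + ~m + m') instead of
 * recomputing the full sum.
 */
#if 0
static unsigned short csum_incremental_update(unsigned short check,
	unsigned short old, unsigned short new)
{
	unsigned long sum = (~check & 0xffff) + (~old & 0xffff) + new;
	sum = (sum >> 16) + (sum & 0xffff);	/* fold the carries */
	sum += (sum >> 16);
	return ~sum & 0xffff;
}
#endif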

/*
 *	Reset the retransmission timer
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if((long)when < 0)
	{
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=jiffies+when;
	add_timer(&sk->retransmit_timer);
}

/*
 *	This is the normal code called for timeouts.  It does the retransmission
 *	and then does backoff.  tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 * Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
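
/*
 * Concretely (illustrative figures): starting from an rto of 3 seconds,
 * successive timeouts after the doubling above run 6, 12, 24, 48 and 96
 * seconds, then sit at the 120 second clamp discussed in the comment.
 */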


/*
 *	A timer event has triggered a tcp retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
	/* sk->ssthresh in theory can be zero.  I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}

/*
 *	A write timeout has occurred. Process the after effects.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		ip_rt_advice(&sk->ip_route_cache, 0);
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		if(sk->err_soft)
			sk->err=sk->err_soft;
		else
			sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		if(sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}
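
/*
 * For scale (assuming the usual TCP_RETR1 = 7 and TCP_RETR2 = 15 from
 * net/tcp.h): the soft route-advice path above triggers on every eighth
 * retransmission, while the hard ETIMEDOUT path fires only after the
 * fifteenth, which with exponential backoff amounts to many minutes of
 * outage before the connection is finally timed out.
 */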

/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	We are reset. We will send no more retransmits.
	 */

	if(sk->zapped)
		return;

	/*
	 *	Only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();


	if (sk->ack_backlog && !sk->dead)
		sk->data_ready(sk,0);

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				if (sk->ack_backlog)
					tcp_read_wakeup(sk);
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				if (jiffies < skb->when + sk->rto)
				{
					if (sk->ack_backlog)
						tcp_read_wakeup(sk);
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->retransmits++;
				sk->prot->retransmits++;
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition.  If err < 0 then the socket should
 *	be closed and the error returned to the user.  If err > 0
 *	it's just the icmp type << 8 | icmp code.  After adjustment
 *	header points to the first 8 bytes of the tcp header.  We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 *	This one is _WRONG_. FIXME urgently.
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * For now we will just trigger a linear backoff.
		 * The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 * Ugly trick to pass MTU to protocol layer.
		 * Really we should add argument "info" to error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
			&& new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (code < 13)
	{
		if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			sk->err = icmp_err_convert[code].errno;
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			{
				tcp_statistics.TcpAttemptFails++;
				tcp_set_state(sk,TCP_CLOSE);
				sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
			}
		}
		else	/* Only an error on timeout */
			sk->err_soft = icmp_err_convert[code].errno;
	}
}
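
/*
 * Worked example for the clamp above (no IP or TCP options assumed):
 * a "fragmentation needed" ICMP smuggling new_mtu = 576 through the
 * id field leaves 576 - 20 - 20 = 536 bytes for TCP payload, so an
 * sk->mtu larger than 536 is cut down to it.
 */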


/*
 *	Walk down the receive queue counting readable data until we hit the end
 *	or we find a gap in the received data queue (i.e. a missing frame that
 *	needs re-sending to us). Not sorting the data into two queues as it
 *	arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stop here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop.  This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}

/*
 * LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);
		return retval;
	}
	return 0;
}


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */

		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN) return(-EINVAL);
			amount = sock_wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}

void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	return;
}
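
/*
 * For reference, the pseudo-header that csum_tcpudp_magic() folds into
 * the sum above, per RFC 793 - shown as an illustrative struct only
 * (an assumption about layout, not something this file defines; real
 * code must also worry about padding and byte order):
 */
#if 0
struct tcp_pseudo_header {
	unsigned long	saddr;		/* source address */
	unsigned long	daddr;		/* destination address */
	unsigned char	zero;		/* always zero */
	unsigned char	protocol;	/* IPPROTO_TCP */
	unsigned short	length;		/* TCP header plus data, network order */
};
#endif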

/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_skb: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);


		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}

/*
 *	Locking problems lead us to a messy situation where we can have
 *	multiple partially complete buffers queued up. This is really bad
 *	as we don't want to be sending partial buffers. Fix this with
 *	a semaphore or similar to lock tcp_write per socket.
 *
 *	These routines are pretty self descriptive.
 */

struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 *	Empty the partial queue
 */

static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 *	Queue a partial frame
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}
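
/*
 * Note that the one second expiry above is what backs the RFC1122
 * "MUST NOT buffer data indefinitely" entry ([1 second]) in the status
 * list at the top of this file.
 */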



/*
 *	This routine sends an ack and also updates the window.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
1640 struct sock *sk,
1641 struct tcphdr *th, unsigned long daddr)
1642 {
1643 struct sk_buff *buff;
1644 struct tcphdr *t1;
1645 struct device *dev = NULL;
1646 int tmp;
1647
1648 if(sk->zapped)
1649 return; /* We have been reset, we may not send again */
1650
1651 /*
1652 * We need to grab some memory, and put together an ack,
1653 * and then put it into the queue to be sent.
1654 */
1655
1656 buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
1657 if (buff == NULL)
1658 {
1659 /*
1660 * Force it to send an ack. We don't have to do this
1661 * (ACK is unreliable) but it's much better use of
1662 * bandwidth on slow links to send a spare ack than
1663 * resend packets.
1664 */
1665
1666 sk->ack_backlog++;
1667 if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
1668 {
1669 reset_xmit_timer(sk, TIME_WRITE, HZ);
1670 }
1671 return;
1672 }
1673
1674 /*
1675 * Assemble a suitable TCP frame
1676 */
1677
1678 buff->sk = sk;
1679 buff->localroute = sk->localroute;
1680
1681 /*
1682 * Put in the IP header and routing stuff.
1683 */
1684
1685 tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
1686 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1687 if (tmp < 0)
1688 {
1689 buff->free = 1;
1690 sock_wfree(sk, buff);
1691 return;
1692 }
1693 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
1694
1695 memcpy(t1, th, sizeof(*t1));
1696
1697 /*
1698 * Swap the send and the receive.
1699 */
1700
1701 t1->dest = th->source;
1702 t1->source = th->dest;
1703 t1->seq = htonl(sequence);
1704 t1->ack = 1;
1705 sk->window = tcp_select_window(sk);
1706 t1->window = htons(sk->window);
1707 t1->res1 = 0;
1708 t1->res2 = 0;
1709 t1->rst = 0;
1710 t1->urg = 0;
1711 t1->syn = 0;
1712 t1->psh = 0;
1713 t1->fin = 0;
1714
1715 /*
1716 * If we have nothing queued for transmit and the transmit timer
1717 * is on we are just doing an ACK timeout and need to switch
1718 * to a keepalive.
1719 */
1720
1721 if (ack == sk->acked_seq) {
1722 sk->ack_backlog = 0;
1723 sk->bytes_rcv = 0;
1724 sk->ack_timed = 0;
1725
1726 if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
1727 && sk->ip_xmit_timeout == TIME_WRITE)
1728 if(sk->keepopen)
1729 reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
1730 else
1731 delete_timer(sk);
1732 }
1733
1734 /*
1735 * Fill in the packet and send it
1736 */
1737
1738 t1->ack_seq = htonl(ack);
1739 t1->doff = sizeof(*t1)/4;
1740 tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
1741 if (sk->debug)
1742 printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
1743 tcp_statistics.TcpOutSegs++;
1744 sk->prot->queue_xmit(sk, dev, buff, 1);
1745 }
1746
1747
1748 /*
1749 * This routine builds a generic TCP header.
1750 */
1751
1752 extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
1753 {
1754
1755 memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
1756 th->seq = htonl(sk->write_seq);
1757 th->psh =(push == 0) ? 1 : 0;
1758 th->doff = sizeof(*th)/4;
1759 th->ack = 1;
1760 th->fin = 0;
1761 sk->ack_backlog = 0;
1762 sk->bytes_rcv = 0;
1763 sk->ack_timed = 0;
1764 th->ack_seq = htonl(sk->acked_seq);
1765 sk->window = tcp_select_window(sk);
1766 th->window = htons(sk->window);
1767
1768 return(sizeof(*th));
1769 }
1770
1771 /*
1772 * This routine copies from a user buffer into a socket,
1773 * and starts the transmit system.
1774 */
1775
1776 static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
1777 int len, int nonblock, int flags)
1778 {
1779 int copied = 0;
1780 int copy;
1781 int tmp;
1782 int seglen;
1783 int iovct=0;
1784 struct sk_buff *skb;
1785 struct sk_buff *send_tmp;
1786 struct proto *prot;
1787 struct device *dev = NULL;
1788 unsigned char *from;
1789
1790 /*
1791 * Do sanity checking for sendmsg/sendto/send
1792 */
1793
1794 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1795 return -EINVAL;
1796 if (msg->msg_name)
1797 {
1798 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1799 if(sk->state == TCP_CLOSE)
1800 return -ENOTCONN;
1801 if (msg->msg_namelen < sizeof(*addr))
1802 return -EINVAL;
1803 if (addr->sin_family && addr->sin_family != AF_INET)
1804 return -EINVAL;
1805 if (addr->sin_port != sk->dummy_th.dest)
1806 return -EISCONN;
1807 if (addr->sin_addr.s_addr != sk->daddr)
1808 return -EISCONN;
1809 }
1810
1811 /*
1812 * Ok commence sending
1813 */
1814
1815 while(iovct<msg->msg_iovlen)
1816 {
1817 seglen=msg->msg_iov[iovct].iov_len;
1818 from=msg->msg_iov[iovct++].iov_base;
1819 sk->inuse=1;
1820 prot = sk->prot;
1821 while(seglen > 0)
1822 {
1823 if (sk->err)
1824 { /* Stop on an error */
1825 release_sock(sk);
1826 if (copied)
1827 return(copied);
1828 return sock_error(sk);
1829 }
1830
1831 /*
1832 * First thing we do is make sure that we are established.
1833 */
1834
1835 if (sk->shutdown & SEND_SHUTDOWN)
1836 {
1837 release_sock(sk);
1838 sk->err = EPIPE;
1839 if (copied)
1840 return(copied);
1841 sk->err = 0;
1842 return(-EPIPE);
1843 }
1844
1845 /*
1846 * Wait for a connection to finish.
1847 */
1848
1849 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1850 {
1851 if (sk->err)
1852 {
1853 release_sock(sk);
1854 if (copied)
1855 return(copied);
1856 return sock_error(sk);
1857 }
1858
1859 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1860 {
1861 release_sock(sk);
1862 if (copied)
1863 return(copied);
1864
1865 if (sk->err)
1866 return sock_error(sk);
1867
1868 if (sk->keepopen)
1869 {
1870 send_sig(SIGPIPE, current, 0);
1871 }
1872 return(-EPIPE);
1873 }
1874
1875 if (nonblock || copied)
1876 {
1877 release_sock(sk);
1878 if (copied)
1879 return(copied);
1880 return(-EAGAIN);
1881 }
1882
1883 release_sock(sk);
1884 cli();
1885
1886 if (sk->state != TCP_ESTABLISHED &&
1887 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1888 {
1889 interruptible_sleep_on(sk->sleep);
1890 if (current->signal & ~current->blocked)
1891 {
1892 sti();
1893 if (copied)
1894 return(copied);
1895 return(-ERESTARTSYS);
1896 }
1897 }
1898 sk->inuse = 1;
1899 sti();
1900 }
1901
1902 /*
1903 * The following code can result in copy <= 0 if sk->mss is ever
1904 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1905 * sk->mtu is constant once SYN processing is finished. I.e. we
1906 * had better not get here until we've seen his SYN and at least one
1907 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1908 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1909 * non-decreasing. Note that any ioctl to set user_mss must be done
1910 * before the exchange of SYN's. If the initial ack from the other
1911 * end has a window of 0, max_window and thus mss will both be 0.
1912 */
1913
1914 /*
1915 * Now we need to check if we have a half built packet.
1916 */
1917 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1918 /*
1919 * FIXME: I'm almost sure that this fragment is a bug,
1920 * but it works... I do not know why 8) --ANK
1921 *
1922 * Really, we should rebuild all the queues...
1923 * It's difficult. A temporary hack is to send all
1924 * queued segments with allowed fragmentation.
1925 */
1926 {
1927 int new_mss = min(sk->mtu, sk->max_window);
1928 if (new_mss < sk->mss)
1929 {
1930 tcp_send_partial(sk);
1931 sk->mss = new_mss;
1932 }
1933 }
1934 #endif
1935
1936 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1937 {
1938 int hdrlen;
1939
1940 /* IP header + TCP header */
1941 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1942 + sizeof(struct tcphdr);
1943
1944 /* Add more stuff to the end of skb->len */
1945 if (!(flags & MSG_OOB))
1946 {
1947 copy = min(sk->mss - (skb->len - hdrlen), seglen);
1948 if (copy <= 0)
1949 {
1950 printk("TCP: **bug**: \"copy\" <= 0\n");
1951 return -EFAULT;
1952 }
1953 memcpy_fromfs(skb_put(skb,copy), from, copy);
1954 from += copy;
1955 copied += copy;
1956 len -= copy;
1957 sk->write_seq += copy;
1958 seglen -= copy;
1959 }
1960 if ((skb->len - hdrlen) >= sk->mss ||
1961 (flags & MSG_OOB) || !sk->packets_out)
1962 tcp_send_skb(sk, skb);
1963 else
1964 tcp_enqueue_partial(skb, sk);
1965 continue;
1966 }
1967
1968 /*
1969 * We also need to worry about the window.
1970 * If window < 1/2 the maximum window we've seen from this
1971 * host, don't use it. This is sender side
1972 * silly window prevention, as specified in RFC1122.
1973 * (Note that this is different from earlier versions of
1974 * SWS prevention, e.g. RFC 813.) What we actually do is
1975 * use the whole MSS. Since this results in the right
1976 * edge of the packet being outside the window, it will
1977 * be queued for later rather than sent.
1978 */
1979
1980 copy = sk->window_seq - sk->write_seq;
1981 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1982 copy = sk->mss;
1983 if (copy > seglen)
1984 copy = seglen;
1985
1986 /*
1987 * We should really check the window here also.
1988 */
1989
1990 send_tmp = NULL;
1991 if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
1992 {
1993 /*
1994 * We will release the socket in case we sleep here.
1995 */
1996 release_sock(sk);
1997 /*
1998 * NB: following must be mtu, because mss can be increased.
1999 * mss is always <= mtu
2000 */
2001 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
2002 sk->inuse = 1;
2003 send_tmp = skb;
2004 }
2005 else
2006 {
2007 /*
2008 * We will release the socket in case we sleep here.
2009 */
2010 release_sock(sk);
2011 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
2012 sk->inuse = 1;
2013 }
2014
2015 /*
2016 * If we didn't get any memory, we need to sleep.
2017 */
2018
2019 if (skb == NULL)
2020 {
2021 sk->socket->flags |= SO_NOSPACE;
2022 if (nonblock)
2023 {
2024 release_sock(sk);
2025 if (copied)
2026 return(copied);
2027 return(-EAGAIN);
2028 }
2029
2030 /*
2031 * FIXME: here is another race condition.
2032 */
2033
2034 tmp = sk->wmem_alloc;
2035 release_sock(sk);
2036 cli();
2037 /*
2038 * Again we will try to avoid it.
2039 */
2040 if (tmp <= sk->wmem_alloc &&
2041 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
2042 && sk->err == 0)
2043 {
2044 sk->socket->flags &= ~SO_NOSPACE;
2045 interruptible_sleep_on(sk->sleep);
2046 if (current->signal & ~current->blocked)
2047 {
2048 sti();
2049 if (copied)
2050 return(copied);
2051 return(-ERESTARTSYS);
2052 }
2053 }
2054 sk->inuse = 1;
2055 sti();
2056 continue;
2057 }
2058
2059 skb->sk = sk;
2060 skb->free = 0;
2061 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
2062
2063 /*
2064 * FIXME: we need to optimize this.
2065 * Perhaps some hints here would be good.
2066 */
2067
2068 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
2069 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2070 if (tmp < 0 )
2071 {
2072 sock_wfree(sk, skb);
2073 release_sock(sk);
2074 if (copied)
2075 return(copied);
2076 return(tmp);
2077 }
2078 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
2079 skb->ip_hdr->frag_off |= htons(IP_DF);
2080 #endif
2081 skb->dev = dev;
2082 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
2083 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
2084 if (tmp < 0)
2085 {
2086 sock_wfree(sk, skb);
2087 release_sock(sk);
2088 if (copied)
2089 return(copied);
2090 return(tmp);
2091 }
2092
2093 if (flags & MSG_OOB)
2094 {
2095 skb->h.th->urg = 1;
2096 skb->h.th->urg_ptr = htons(copy);
2097 }
2098
2099 memcpy_fromfs(skb_put(skb,copy), from, copy);
2100
2101 from += copy;
2102 copied += copy;
2103 len -= copy;
2104 seglen -= copy;
2105 skb->free = 0;
2106 sk->write_seq += copy;
2107
2108 if (send_tmp != NULL)
2109 {
2110 tcp_enqueue_partial(send_tmp, sk);
2111 continue;
2112 }
2113 tcp_send_skb(sk, skb);
2114 }
2115 }
2116 sk->err = 0;
2117
2118 /*
2119 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
2120 * interactive fast network servers. It's meant to be on and
2121 * it really improves the throughput though not the echo time
2122 * on my slow slip link - Alan
2123 */
2124
2125 /*
2126 * Avoid possible race on send_tmp - c/o Johannes Stille
2127 */
2128
2129 if(sk->partial && ((!sk->packets_out)
2130 /* If not nagling we can send on the before case too.. */
2131 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2132 ))
2133 tcp_send_partial(sk);
2134
2135 release_sock(sk);
2136 return(copied);
2137 }
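
/*
 * The flush test above, isolated as a predicate (sketch, hypothetical
 * name; before() is the signed sequence-space compare used throughout
 * this file): a pending partial frame may go out when nothing is in
 * flight, or when TCP_NODELAY disabled nagling and the offered window
 * still has room.
 */

static inline int tcp_can_flush_partial(struct sock *sk)
{
	return sk->packets_out == 0 ||
	       (sk->nonagle && before(sk->write_seq, sk->window_seq));
}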
2138
2139 /*
2140 * Send an ack if one is backlogged at this point. Ought to merge
2141 * this with tcp_send_ack().
2142 * This is called for delayed acks also.
2143 */
2144
2145 static void tcp_read_wakeup(struct sock *sk)
2146 {
2147 int tmp;
2148 struct device *dev = NULL;
2149 struct tcphdr *t1;
2150 struct sk_buff *buff;
2151
2152 if (!sk->ack_backlog)
2153 return;
2154
2155 /*
2156 * If we're closed, don't send an ack, or we'll get a RST
2157 * from the closed destination.
2158 */
2159 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2160 return;
2161
2162 /*
2163 * FIXME: we need to put code here to prevent this routine from
2164 * being called. Being called once in a while is ok, so only check
2165 * if this is the second time in a row.
2166 */
2167
2168 /*
2169 * We need to grab some memory, and put together an ack,
2170 * and then put it into the queue to be sent.
2171 */
2172
2173 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2174 if (buff == NULL)
2175 {
2176 /* Try again real soon. */
2177 reset_xmit_timer(sk, TIME_WRITE, HZ);
2178 return;
2179 }
2180
2181 buff->sk = sk;
2182 buff->localroute = sk->localroute;
2183
2184 /*
2185 * Put in the IP header and routing stuff.
2186 */
2187
2188 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2189 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2190 if (tmp < 0)
2191 {
2192 buff->free = 1;
2193 sock_wfree(sk, buff);
2194 return;
2195 }
2196
2197 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2198
2199 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2200 t1->seq = htonl(sk->sent_seq);
2201 t1->ack = 1;
2202 t1->res1 = 0;
2203 t1->res2 = 0;
2204 t1->rst = 0;
2205 t1->urg = 0;
2206 t1->syn = 0;
2207 t1->psh = 0;
2208
2209
2210 sk->ack_backlog = 0;
2211 sk->bytes_rcv = 0;
2212
2213 sk->window = tcp_select_window(sk);
2214 t1->window = htons(sk->window);
2215 t1->ack_seq = htonl(sk->acked_seq);
2216 t1->doff = sizeof(*t1)/4;
2217 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2218 sk->prot->queue_xmit(sk, dev, buff, 1);
2219 tcp_statistics.TcpOutSegs++;
2220 }
2221
2222
2223 /*
2224 * FIXME:
2225 * This routine frees used buffers.
2226 * It should consider sending an ACK to let the
2227 * other end know we now have a bigger window.
2228 */
2229
2230 static void cleanup_rbuf(struct sock *sk)
2231 {
2232 unsigned long flags;
2233 struct sk_buff *skb;
2234 unsigned long rspace;
2235
2236 save_flags(flags);
2237 cli();
2238
2239 /*
2240 * See if we have anything to free up?
2241 */
2242
2243 skb = skb_peek(&sk->receive_queue);
2244 if (!skb || !skb->used || skb->users) {
2245 restore_flags(flags);
2246 return;
2247 }
2248
2249 /*
2250 * We have to loop through all the buffer headers,
2251 * and try to free up all the space we can.
2252 */
2253
2254 do {
2255 skb_unlink(skb);
2256 skb->sk = sk;
2257 kfree_skb(skb, FREE_READ);
2258 skb = skb_peek(&sk->receive_queue);
2259 } while (skb && skb->used && !skb->users);
2260 restore_flags(flags);
2261
2262 /*
2263 * FIXME:
2264 * At this point we should send an ack if the difference
2265 * in the window, and the amount of space is bigger than
2266 * TCP_WINDOW_DIFF.
2267 */
2268
2269 rspace=sock_rspace(sk);
2270 if(sk->debug)
2271 printk("sk->rspace = %lu\n", rspace);
2272 /*
2273 * This area has caused the most trouble. The current strategy
2274 * is to simply do nothing if the other end has room to send at
2275 * least 3 full packets, because the ack from those will auto-
2276 * matically update the window. If the other end doesn't think
2277 * we have much space left, but we have room for at least 1 more
2278 * complete packet than it thinks we do, we will send an ack
2279 * immediately. Otherwise we will wait up to .5 seconds in case
2280 * the user reads some more.
2281 */
2282 sk->ack_backlog++;
2283
2284 /*
2285 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2286 * if the other end is offering a window smaller than the agreed on MSS
2287 * (called sk->mtu here). In theory there's no connection between send
2288 * and receive, and so no reason to think that they're going to send
2289 * small packets. For the moment I'm using the hack of reducing the mss
2290 * only on the send side, so I'm putting mtu here.
2291 */
2292
2293 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2294 {
2295 /* Send an ack right now. */
2296 tcp_read_wakeup(sk);
2297 }
2298 else
2299 {
2300 /* Force it to send an ack soon. */
2301 int was_active = del_timer(&sk->retransmit_timer);
2302 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2303 {
2304 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2305 }
2306 else
2307 add_timer(&sk->retransmit_timer);
2308 }
2309 }
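
/*
 * The policy above as a predicate (sketch, hypothetical name): ack
 * immediately when the window we could now advertise exceeds the
 * peer's current idea of it by more than a full segment; otherwise
 * leave it to the TCP_ACK_TIME (~0.5 second) delayed-ack timer.
 */

static inline int tcp_should_ack_now(struct sock *sk, unsigned long rspace)
{
	return rspace > (sk->window - sk->bytes_rcv + sk->mtu);
}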
2310
2311
2312 /*
2313 * Handle reading urgent data. BSD has very simple semantics for
2314 * this, no blocking and very strange errors 8)
2315 */
2316
2317 static int tcp_recv_urg(struct sock * sk, int nonblock,
2318 struct msghdr *msg, int len, int flags, int *addr_len)
2319 {
2320 /*
2321 * No URG data to read
2322 */
2323 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2324 return -EINVAL; /* Yes this is right ! */
2325
2326 if (sk->err)
2327 return sock_error(sk);
2328
2329 if (sk->state == TCP_CLOSE || sk->done)
2330 {
2331 if (!sk->done)
2332 {
2333 sk->done = 1;
2334 return 0;
2335 }
2336 return -ENOTCONN;
2337 }
2338
2339 if (sk->shutdown & RCV_SHUTDOWN)
2340 {
2341 sk->done = 1;
2342 return 0;
2343 }
2344 sk->inuse = 1;
2345 if (sk->urg_data & URG_VALID)
2346 {
2347 char c = sk->urg_data;
2348 if (!(flags & MSG_PEEK))
2349 sk->urg_data = URG_READ;
2350 memcpy_toiovec(msg->msg_iov, &c, 1);
2351 if(msg->msg_name)
2352 {
2353 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2354 sin->sin_family=AF_INET;
2355 sin->sin_addr.s_addr=sk->daddr;
2356 sin->sin_port=sk->dummy_th.dest;
2357 }
2358 if(addr_len)
2359 *addr_len=sizeof(struct sockaddr_in);
2360 release_sock(sk);
2361 return 1;
2362 }
2363 release_sock(sk);
2364
2365 /*
2366 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2367 * the available implementations agree in this case:
2368 * this call should never block, independent of the
2369 * blocking state of the socket.
2370 * Mike <pall@rz.uni-karlsruhe.de>
2371 */
2372 return -EAGAIN;
2373 }
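
/*
 * The userspace view of the semantics implemented above (sketch; fd
 * is assumed to be a connected TCP socket with SO_OOBINLINE clear):
 * recv(..., MSG_OOB) returns the single urgent byte at most once and
 * never blocks, failing with EAGAIN while no urgent byte is pending
 * and EINVAL once it has been consumed.
 */

#include <sys/socket.h>
#include <errno.h>

static int read_oob_byte(int fd, char *c)
{
	int n = recv(fd, c, 1, MSG_OOB);
	if (n < 0 && (errno == EAGAIN || errno == EINVAL))
		return 0;		/* nothing urgent to read right now */
	return n;			/* 1 on success, -1 on a real error */
}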
2374
2375
2376 /*
2377 * This routine copies from a sock struct into the user buffer.
2378 */
2379
2380 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
2381 int len, int nonblock, int flags, int *addr_len)
2382 {
2383 struct wait_queue wait = { current, NULL };
2384 int copied = 0;
2385 u32 peek_seq;
2386 volatile u32 *seq; /* So gcc doesn't overoptimise */
2387 unsigned long used;
2388
2389 /*
2390 * This error should be checked.
2391 */
2392
2393 if (sk->state == TCP_LISTEN)
2394 return -ENOTCONN;
2395
2396 /*
2397 * Urgent data needs to be handled specially.
2398 */
2399
2400 if (flags & MSG_OOB)
2401 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2402
2403 /*
2404 * Copying sequence to update. This is volatile to handle
2405 * the multi-reader case neatly (memcpy_to/fromfs might be
2406 * inline and thus not flush cached variables otherwise).
2407 */
2408
2409 peek_seq = sk->copied_seq;
2410 seq = &sk->copied_seq;
2411 if (flags & MSG_PEEK)
2412 seq = &peek_seq;
2413
2414 add_wait_queue(sk->sleep, &wait);
2415 sk->inuse = 1;
2416 while (len > 0)
2417 {
2418 struct sk_buff * skb;
2419 u32 offset;
2420
2421 /*
2422 * Are we at urgent data? Stop if we have read anything.
2423 */
2424
2425 if (copied && sk->urg_data && sk->urg_seq == *seq)
2426 break;
2427
2428 /*
2429 * Next get a buffer.
2430 */
2431
2432 current->state = TASK_INTERRUPTIBLE;
2433
2434 skb = skb_peek(&sk->receive_queue);
2435 do
2436 {
2437 if (!skb)
2438 break;
2439 if (before(*seq, skb->seq))
2440 break;
2441 offset = *seq - skb->seq;
2442 if (skb->h.th->syn)
2443 offset--;
2444 if (offset < skb->len)
2445 goto found_ok_skb;
2446 if (skb->h.th->fin)
2447 goto found_fin_ok;
2448 if (!(flags & MSG_PEEK))
2449 skb->used = 1;
2450 skb = skb->next;
2451 }
2452 while (skb != (struct sk_buff *)&sk->receive_queue);
2453
2454 if (copied)
2455 break;
2456
2457 if (sk->err)
2458 {
2459 copied = sock_error(sk);
2460 break;
2461 }
2462
2463 if (sk->state == TCP_CLOSE)
2464 {
2465 if (!sk->done)
2466 {
2467 sk->done = 1;
2468 break;
2469 }
2470 copied = -ENOTCONN;
2471 break;
2472 }
2473
2474 if (sk->shutdown & RCV_SHUTDOWN)
2475 {
2476 sk->done = 1;
2477 break;
2478 }
2479
2480 if (nonblock)
2481 {
2482 copied = -EAGAIN;
2483 break;
2484 }
2485
2486 cleanup_rbuf(sk);
2487 release_sock(sk);
2488 sk->socket->flags |= SO_WAITDATA;
2489 schedule();
2490 sk->socket->flags &= ~SO_WAITDATA;
2491 sk->inuse = 1;
2492
2493 if (current->signal & ~current->blocked)
2494 {
2495 copied = -ERESTARTSYS;
2496 break;
2497 }
2498 continue;
2499
2500 found_ok_skb:
2501 /*
2502 * Lock the buffer. We can be fairly relaxed as
2503 * an interrupt will never steal a buffer we are
2504 * using unless I've missed something serious in
2505 * tcp_data.
2506 */
2507
2508 skb->users++;
2509
2510 /*
2511 * Ok so how much can we use ?
2512 */
2513
2514 used = skb->len - offset;
2515 if (len < used)
2516 used = len;
2517 /*
2518 * Do we have urgent data here?
2519 */
2520
2521 if (sk->urg_data)
2522 {
2523 u32 urg_offset = sk->urg_seq - *seq;
2524 if (urg_offset < used)
2525 {
2526 if (!urg_offset)
2527 {
2528 if (!sk->urginline)
2529 {
2530 ++*seq;
2531 offset++;
2532 used--;
2533 }
2534 }
2535 else
2536 used = urg_offset;
2537 }
2538 }
2539
2540 /*
2541 * Copy it - We _MUST_ update *seq first so that we
2542 * don't ever double read when we have dual readers
2543 */
2544
2545 *seq += used;
2546
2547 /*
2548 * This memcpy_tofs can sleep. If it sleeps and we
2549 * do a second read it relies on the skb->users to avoid
2550 * a crash when cleanup_rbuf() gets called.
2551 */
2552
2553 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2554 skb->h.th->doff*4 + offset, used);
2555 copied += used;
2556 len -= used;
2557
2558 /*
2559 * We now will not sleep again until we are finished
2560 * with skb. Sorry if you are doing the SMP port
2561 * but you'll just have to fix it neatly ;)
2562 */
2563
2564 skb->users --;
2565
2566 if (after(sk->copied_seq,sk->urg_seq))
2567 sk->urg_data = 0;
2568 if (used + offset < skb->len)
2569 continue;
2570
2571 /*
2572 * Process the FIN.
2573 */
2574
2575 if (skb->h.th->fin)
2576 goto found_fin_ok;
2577 if (flags & MSG_PEEK)
2578 continue;
2579 skb->used = 1;
2580 continue;
2581
2582 found_fin_ok:
2583 ++*seq;
2584 if (flags & MSG_PEEK)
2585 break;
2586
2587 /*
2588 * All is done
2589 */
2590
2591 skb->used = 1;
2592 sk->shutdown |= RCV_SHUTDOWN;
2593 break;
2594
2595 }
2596
2597 if(copied>0 && msg->msg_name)
2598 {
2599 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2600 sin->sin_family=AF_INET;
2601 sin->sin_addr.s_addr=sk->daddr;
2602 sin->sin_port=sk->dummy_th.dest;
2603 }
2604 if(addr_len)
2605 *addr_len=sizeof(struct sockaddr_in);
2606
2607 remove_wait_queue(sk->sleep, &wait);
2608 current->state = TASK_RUNNING;
2609
2610 /* Clean up data we have read: This will do ACK frames */
2611 cleanup_rbuf(sk);
2612 release_sock(sk);
2613 return copied;
2614 }
2615
2616
2617
2618 /*
2619 * State processing on a close. This implements the state shift for
2620 * sending our FIN frame. Note that we only send a FIN for some
2621 * states. A shutdown() may have already sent the FIN, or we may be
2622 * closed.
2623 */
2624
2625 static int tcp_close_state(struct sock *sk, int dead)
2626 {
2627 int ns=TCP_CLOSE;
2628 int send_fin=0;
2629 switch(sk->state)
2630 {
2631 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2632 break;
2633 case TCP_SYN_RECV:
2634 case TCP_ESTABLISHED: /* Closedown begin */
2635 ns=TCP_FIN_WAIT1;
2636 send_fin=1;
2637 break;
2638 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2639 case TCP_FIN_WAIT2:
2640 case TCP_CLOSING:
2641 ns=sk->state;
2642 break;
2643 case TCP_CLOSE:
2644 case TCP_LISTEN:
2645 break;
2646 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2647 wait only for the ACK */
2648 ns=TCP_LAST_ACK;
2649 send_fin=1;
2650 }
2651
2652 tcp_set_state(sk,ns);
2653
2654 /*
2655 * This is a (useful) BSD violation of the RFC. There is a
2656 * problem with TCP as specified in that the other end could
2657 * keep a socket open forever with no application left at this end.
2658 * We use a 3 minute timeout (about the same as BSD) then kill
2659 * our end. If they send after that then tough - BUT: long enough
2660 * that we won't make the old 4*rto = almost no time - whoops
2661 * reset mistake.
2662 */
2663 if(dead && ns==TCP_FIN_WAIT2)
2664 {
2665 int timer_active=del_timer(&sk->timer);
2666 if(timer_active)
2667 add_timer(&sk->timer);
2668 else
2669 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2670 }
2671
2672 return send_fin;
2673 }
2674
2675 /*
2676 * Send a fin.
2677 */
2678
2679 static void tcp_send_fin(struct sock *sk)
2680 {
2681 struct proto *prot =(struct proto *)sk->prot;
2682 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2683 struct tcphdr *t1;
2684 struct sk_buff *buff;
2685 struct device *dev=NULL;
2686 int tmp;
2687
2688 release_sock(sk); /* in case the malloc sleeps. */
2689
2690 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2691 sk->inuse = 1;
2692
2693 if (buff == NULL)
2694 {
2695 /* This is a disaster if it occurs */
2696 printk("tcp_send_fin: Impossible malloc failure");
2697 return;
2698 }
2699
2700 /*
2701 * Administrivia
2702 */
2703
2704 buff->sk = sk;
2705 buff->localroute = sk->localroute;
2706
2707 /*
2708 * Put in the IP header and routing stuff.
2709 */
2710
2711 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2712 IPPROTO_TCP, sk->opt,
2713 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2714 if (tmp < 0)
2715 {
2716 int t;
2717 /*
2718 * Finish anyway, treat this as a send that got lost.
2719 * (Not good).
2720 */
2721
2722 buff->free = 1;
2723 sock_wfree(sk,buff);
2724 sk->write_seq++;
2725 t=del_timer(&sk->timer);
2726 if(t)
2727 add_timer(&sk->timer);
2728 else
2729 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2730 return;
2731 }
2732
2733 /*
2734 * We ought to check if the end of the queue is a buffer and
2735 * if so simply add the fin to that buffer, not send it ahead.
2736 */
2737
2738 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2739 buff->dev = dev;
2740 memcpy(t1, th, sizeof(*t1));
2741 buff->seq = sk->write_seq;
2742 sk->write_seq++;
2743 buff->end_seq = sk->write_seq;
2744 t1->seq = htonl(buff->seq);
2745 t1->ack = 1;
2746 t1->ack_seq = htonl(sk->acked_seq);
2747 t1->window = htons(sk->window=tcp_select_window(sk));
2748 t1->fin = 1;
2749 t1->rst = 0;
2750 t1->doff = sizeof(*t1)/4;
2751 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2752
2753 /*
2754 * If there is data in the write queue, the fin must be appended to
2755 * the write queue.
2756 */
2757
2758 if (skb_peek(&sk->write_queue) != NULL)
2759 {
2760 buff->free = 0;
2761 if (buff->next != NULL)
2762 {
2763 printk("tcp_send_fin: next != NULL\n");
2764 skb_unlink(buff);
2765 }
2766 skb_queue_tail(&sk->write_queue, buff);
2767 }
2768 else
2769 {
2770 sk->sent_seq = sk->write_seq;
2771 sk->prot->queue_xmit(sk, dev, buff, 0);
2772 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2773 }
2774 }
2775
2776 /*
2777 * Shutdown the sending side of a connection. Much like close except
2778 * that we don't receive shut down or set sk->dead=1.
2779 */
2780
2781 void tcp_shutdown(struct sock *sk, int how)
2782 {
2783 /*
2784 * We need to grab some memory, and put together a FIN,
2785 * and then put it into the queue to be sent.
2786 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2787 */
2788
2789 if (!(how & SEND_SHUTDOWN))
2790 return;
2791
2792 /*
2793 * If we've already sent a FIN, or it's a closed state
2794 */
2795
2796 if (sk->state == TCP_FIN_WAIT1 ||
2797 sk->state == TCP_FIN_WAIT2 ||
2798 sk->state == TCP_CLOSING ||
2799 sk->state == TCP_LAST_ACK ||
2800 sk->state == TCP_TIME_WAIT ||
2801 sk->state == TCP_CLOSE ||
2802 sk->state == TCP_LISTEN
2803 )
2804 {
2805 return;
2806 }
2807 sk->inuse = 1;
2808
2809 /*
2810 * flag that the sender has shutdown
2811 */
2812
2813 sk->shutdown |= SEND_SHUTDOWN;
2814
2815 /*
2816 * Clear out any half completed packets.
2817 */
2818
2819 if (sk->partial)
2820 tcp_send_partial(sk);
2821
2822 /*
2823 * FIN if needed
2824 */
2825
2826 if(tcp_close_state(sk,0))
2827 tcp_send_fin(sk);
2828
2829 release_sock(sk);
2830 }
2831
2832 /*
2833 * This routine will send an RST to the other tcp.
2834 */
2835
2836 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2837 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2838 {
2839 struct sk_buff *buff;
2840 struct tcphdr *t1;
2841 int tmp;
2842 struct device *ndev=NULL;
2843
2844 /*
2845 * Cannot reset a reset (Think about it).
2846 */
2847
2848 if(th->rst)
2849 return;
2850
2851 /*
2852 * We need to grab some memory, and put together an RST,
2853 * and then put it into the queue to be sent.
2854 */
2855
2856 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2857 if (buff == NULL)
2858 return;
2859
2860 buff->sk = NULL;
2861 buff->dev = dev;
2862 buff->localroute = 0;
2863
2864 /*
2865 * Put in the IP header and routing stuff.
2866 */
2867
2868 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2869 sizeof(struct tcphdr),tos,ttl,NULL);
2870 if (tmp < 0)
2871 {
2872 buff->free = 1;
2873 sock_wfree(NULL, buff);
2874 return;
2875 }
2876
2877 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2878 memcpy(t1, th, sizeof(*t1));
2879
2880 /*
2881 * Swap the send and the receive.
2882 */
2883
2884 t1->dest = th->source;
2885 t1->source = th->dest;
2886 t1->rst = 1;
2887 t1->window = 0;
2888
2889 if(th->ack)
2890 {
2891 t1->ack = 0;
2892 t1->seq = th->ack_seq;
2893 t1->ack_seq = 0;
2894 }
2895 else
2896 {
2897 t1->ack = 1;
2898 if(!th->syn)
2899 t1->ack_seq = th->seq;
2900 else
2901 t1->ack_seq = htonl(ntohl(th->seq)+1);
2902 t1->seq = 0;
2903 }
2904
2905 t1->syn = 0;
2906 t1->urg = 0;
2907 t1->fin = 0;
2908 t1->psh = 0;
2909 t1->doff = sizeof(*t1)/4;
2910 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2911 prot->queue_xmit(NULL, ndev, buff, 1);
2912 tcp_statistics.TcpOutSegs++;
2913 }
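
/*
 * For reference, the sequence/ack choice above follows the RFC 793
 * reset generation rules: if the offending segment carried an ACK,
 * reply <SEQ=SEG.ACK><CTL=RST>; otherwise reply
 * <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>. With no segment length
 * to hand, the code approximates SEG.LEN as 1 for a SYN and 0
 * otherwise.
 */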
2914
2915
2916 /*
2917 * Look for tcp options. Parses everything but only knows about MSS.
2918 * This routine is always called with the packet containing the SYN.
2919 * However it may also be called with the ack to the SYN. So you
2920 * can't assume this is always the SYN. It's always called after
2921 * we have set up sk->mtu to our own MTU.
2922 *
2923 * We need at minimum to add PAWS support here. Possibly large windows
2924 * as Linux gets deployed on 100Mb/sec networks.
2925 */
2926
2927 static void tcp_options(struct sock *sk, struct tcphdr *th)
2928 {
2929 unsigned char *ptr;
2930 int length=(th->doff*4)-sizeof(struct tcphdr);
2931 int mss_seen = 0;
2932
2933 ptr = (unsigned char *)(th + 1);
2934
2935 while(length>0)
2936 {
2937 int opcode=*ptr++;
2938 int opsize=*ptr++;
2939 switch(opcode)
2940 {
2941 case TCPOPT_EOL:
2942 return;
2943 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2944 length--;
2945 ptr--; /* the opsize=*ptr++ above was a mistake */
2946 continue;
2947
2948 default:
2949 if(opsize<=2) /* Avoid silly options looping forever */
2950 return;
2951 switch(opcode)
2952 {
2953 case TCPOPT_MSS:
2954 if(opsize==4 && th->syn)
2955 {
2956 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2957 mss_seen = 1;
2958 }
2959 break;
2960 /* Add other options here as people feel the urge to implement stuff like large windows */
2961 }
2962 ptr+=opsize-2;
2963 length-=opsize;
2964 }
2965 }
2966 if (th->syn)
2967 {
2968 if (! mss_seen)
2969 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2970 }
2971 #ifdef CONFIG_INET_PCTCP
2972 sk->mss = min(sk->max_window >> 1, sk->mtu);
2973 #else
2974 sk->mss = min(sk->max_window, sk->mtu);
2975 sk->max_unacked = 2 * sk->mss;
2976 #endif
2977 }
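
/*
 * The wire format parsed above: an MSS option is kind 2, length 4,
 * then the 16-bit MSS in network byte order. A sketch of the matching
 * encoder (hypothetical helper; tcp_conn_request() below writes the
 * same four bytes by hand into its SYN-ACK):
 */

static inline void tcp_put_mss_option(unsigned char *ptr, unsigned short mss)
{
	ptr[0] = TCPOPT_MSS;		/* kind = 2 */
	ptr[1] = 4;			/* length, counting kind and length */
	ptr[2] = (mss >> 8) & 0xff;	/* MSS, high byte first */
	ptr[3] = mss & 0xff;
}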
2978
2979 static inline unsigned long default_mask(unsigned long dst)
2980 {
2981 dst = ntohl(dst);
2982 if (IN_CLASSA(dst))
2983 return htonl(IN_CLASSA_NET);
2984 if (IN_CLASSB(dst))
2985 return htonl(IN_CLASSB_NET);
2986 return htonl(IN_CLASSC_NET);
2987 }
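
/*
 * Example (sketch): a class A destination such as 10.1.2.3 maps to
 * 255.0.0.0, a class B such as 130.1.2.3 to 255.255.0.0, and anything
 * else to 255.255.255.0 - the pre-CIDR classful defaults.
 */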
2988
2989 /*
2990 * Default sequence number picking algorithm.
2991 * As close as possible to RFC 793, which
2992 * suggests using a 250kHz clock.
2993 * Further reading shows this assumes 2MB/s networks.
2994 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2995 * That's funny, Linux has one built in! Use it!
2996 */
2997
2998 extern inline u32 tcp_init_seq(void)
2999 {
3000 struct timeval tv;
3001 do_gettimeofday(&tv);
3002 return tv.tv_usec+tv.tv_sec*1000000;
3003 }
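
/*
 * At 1 MHz the 32-bit sequence space wraps every 2^32 microseconds,
 * about 71.6 minutes - comfortably longer than any plausible segment
 * lifetime, which is what RFC 793 asks of the ISN clock.
 */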
3004
3005 /*
3006 * This routine handles a connection request.
3007 * It should make sure we haven't already responded.
3008 * Because of the way BSD works, we have to send a syn/ack now.
3009 * This also means it will be harder to close a socket which is
3010 * listening.
3011 */
3012
3013 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
3014 unsigned long daddr, unsigned long saddr,
3015 struct options *opt, struct device *dev, u32 seq)
3016 {
3017 struct sk_buff *buff;
3018 struct tcphdr *t1;
3019 unsigned char *ptr;
3020 struct sock *newsk;
3021 struct tcphdr *th;
3022 struct device *ndev=NULL;
3023 int tmp;
3024 struct rtable *rt;
3025
3026 th = skb->h.th;
3027
3028 /* If the socket is dead, don't accept the connection. */
3029 if (!sk->dead)
3030 {
3031 sk->data_ready(sk,0);
3032 }
3033 else
3034 {
3035 if(sk->debug)
3036 printk("Reset on %p: Connect on dead socket.\n",sk);
3037 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
3038 tcp_statistics.TcpAttemptFails++;
3039 kfree_skb(skb, FREE_READ);
3040 return;
3041 }
3042
3043 /*
3044 * Make sure we can accept more. This will prevent a
3045 * flurry of syns from eating up all our memory.
3046 */
3047
3048 if (sk->ack_backlog >= sk->max_ack_backlog)
3049 {
3050 tcp_statistics.TcpAttemptFails++;
3051 kfree_skb(skb, FREE_READ);
3052 return;
3053 }
3054
3055 /*
3056 * We need to build a new sock struct.
3057 * It is sort of bad to have a socket without an inode attached
3058 * to it, but the wake_up's will just wake up the listening socket,
3059 * and if the listening socket is destroyed before this is taken
3060 * off of the queue, this will take care of it.
3061 */
3062
3063 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
3064 if (newsk == NULL)
3065 {
3066 /* just ignore the syn. It will get retransmitted. */
3067 tcp_statistics.TcpAttemptFails++;
3068 kfree_skb(skb, FREE_READ);
3069 return;
3070 }
3071
3072 memcpy(newsk, sk, sizeof(*newsk));
3073 newsk->opt = NULL;
3074 newsk->ip_route_cache = NULL;
3075 if (opt && opt->optlen) {
3076 newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC); /* echoed options belong on the new socket */
3077 if (!newsk->opt) {
3078 kfree_s(newsk, sizeof(struct sock));
3079 tcp_statistics.TcpAttemptFails++;
3080 kfree_skb(skb, FREE_READ);
3081 return;
3082 }
3083 if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) {
3084 kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
3085 kfree_s(newsk, sizeof(struct sock));
3086 tcp_statistics.TcpAttemptFails++;
3087 kfree_skb(skb, FREE_READ);
3088 return;
3089 }
3090 }
3091 skb_queue_head_init(&newsk->write_queue);
3092 skb_queue_head_init(&newsk->receive_queue);
3093 newsk->send_head = NULL;
3094 newsk->send_tail = NULL;
3095 skb_queue_head_init(&newsk->back_log);
3096 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
3097 newsk->rto = TCP_TIMEOUT_INIT;
3098 newsk->mdev = 0;
3099 newsk->max_window = 0;
3100 newsk->cong_window = 1;
3101 newsk->cong_count = 0;
3102 newsk->ssthresh = 0;
3103 newsk->backoff = 0;
3104 newsk->blog = 0;
3105 newsk->intr = 0;
3106 newsk->proc = 0;
3107 newsk->done = 0;
3108 newsk->partial = NULL;
3109 newsk->pair = NULL;
3110 newsk->wmem_alloc = 0;
3111 newsk->rmem_alloc = 0;
3112 newsk->localroute = sk->localroute;
3113
3114 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3115
3116 newsk->err = 0;
3117 newsk->shutdown = 0;
3118 newsk->ack_backlog = 0;
3119 newsk->acked_seq = skb->seq+1;
3120 newsk->lastwin_seq = skb->seq+1;
3121 newsk->delay_acks = 1;
3122 newsk->copied_seq = skb->seq+1;
3123 newsk->fin_seq = skb->seq;
3124 newsk->state = TCP_SYN_RECV;
3125 newsk->timeout = 0;
3126 newsk->ip_xmit_timeout = 0;
3127 newsk->write_seq = seq;
3128 newsk->window_seq = newsk->write_seq;
3129 newsk->rcv_ack_seq = newsk->write_seq;
3130 newsk->urg_data = 0;
3131 newsk->retransmits = 0;
3132 newsk->linger=0;
3133 newsk->destroy = 0;
3134 init_timer(&newsk->timer);
3135 newsk->timer.data = (unsigned long)newsk;
3136 newsk->timer.function = &net_timer;
3137 init_timer(&newsk->retransmit_timer);
3138 newsk->retransmit_timer.data = (unsigned long)newsk;
3139 newsk->retransmit_timer.function=&retransmit_timer;
3140 newsk->dummy_th.source = skb->h.th->dest;
3141 newsk->dummy_th.dest = skb->h.th->source;
3142
3143 /*
3144 * Swap these two, they are from our point of view.
3145 */
3146
3147 newsk->daddr = saddr;
3148 newsk->saddr = daddr;
3149 newsk->rcv_saddr = daddr;
3150
3151 put_sock(newsk->num,newsk);
3152 newsk->dummy_th.res1 = 0;
3153 newsk->dummy_th.doff = 6;
3154 newsk->dummy_th.fin = 0;
3155 newsk->dummy_th.syn = 0;
3156 newsk->dummy_th.rst = 0;
3157 newsk->dummy_th.psh = 0;
3158 newsk->dummy_th.ack = 0;
3159 newsk->dummy_th.urg = 0;
3160 newsk->dummy_th.res2 = 0;
3161 newsk->acked_seq = skb->seq + 1;
3162 newsk->copied_seq = skb->seq + 1;
3163 newsk->socket = NULL;
3164
3165 /*
3166 * Grab the ttl and tos values and use them
3167 */
3168
3169 newsk->ip_ttl=sk->ip_ttl;
3170 newsk->ip_tos=skb->ip_hdr->tos;
3171
3172 /*
3173 * Use 512 or whatever user asked for
3174 */
3175
3176 /*
3177 * Note use of sk->user_mss, since user has no direct access to newsk
3178 */
3179
3180 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3181 newsk->ip_route_cache = rt;
3182
3183 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3184 newsk->window_clamp = rt->rt_window;
3185 else
3186 newsk->window_clamp = 0;
3187
3188 if (sk->user_mss)
3189 newsk->mtu = sk->user_mss;
3190 else if (rt)
3191 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
3192 else
3193 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3194
3195 /*
3196 * But not bigger than device MTU
3197 */
3198
3199 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3200
3201 #ifdef CONFIG_SKIP
3202
3203 /*
3204 * SKIP devices set their MTU to 65535. This is so they can take packets
3205 * unfragmented to the security process and then fragment. They could lie to the
3206 * TCP layer about a suitable MTU, but it's easier to let SKIP sort it out
3207 * simply because the final packet we want unfragmented is going to be
3208 *
3209 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
3210 */
3211
3212 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
3213 newsk->mtu=skip_pick_mtu(newsk->mtu,dev); /* clamp the new socket's mtu, not the listener's */
3214 #endif
3215 /*
3216 * This will min with what arrived in the packet
3217 */
3218
3219 tcp_options(newsk,skb->h.th);
3220
3221 tcp_cache_zap();
3222
3223 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3224 if (buff == NULL)
3225 {
3226 sk->err = ENOMEM;
3227 newsk->dead = 1;
3228 newsk->state = TCP_CLOSE;
3229 /* And this will destroy it */
3230 release_sock(newsk);
3231 kfree_skb(skb, FREE_READ);
3232 tcp_statistics.TcpAttemptFails++;
3233 return;
3234 }
3235
3236 buff->sk = newsk;
3237 buff->localroute = newsk->localroute;
3238
3239 /*
3240 * Put in the IP header and routing stuff.
3241 */
3242
3243 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3244 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3245
3246 /*
3247 * Something went wrong.
3248 */
3249
3250 if (tmp < 0)
3251 {
3252 sk->err = tmp;
3253 buff->free = 1;
3254 kfree_skb(buff,FREE_WRITE);
3255 newsk->dead = 1;
3256 newsk->state = TCP_CLOSE;
3257 release_sock(newsk);
3258 skb->sk = sk;
3259 kfree_skb(skb, FREE_READ);
3260 tcp_statistics.TcpAttemptFails++;
3261 return;
3262 }
3263
3264 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3265
3266 memcpy(t1, skb->h.th, sizeof(*t1));
3267 buff->seq = newsk->write_seq++;
3268 buff->end_seq = newsk->write_seq;
3269 /*
3270 * Swap the send and the receive.
3271 */
3272 t1->dest = skb->h.th->source;
3273 t1->source = newsk->dummy_th.source;
3274 t1->seq = htonl(buff->seq);
3275 t1->ack = 1;
3276 newsk->sent_seq = newsk->write_seq;
3277 t1->window = htons(tcp_select_window(newsk));
3278 t1->res1 = 0;
3279 t1->res2 = 0;
3280 t1->rst = 0;
3281 t1->urg = 0;
3282 t1->psh = 0;
3283 t1->syn = 1;
3284 t1->ack_seq = htonl(newsk->acked_seq);
3285 t1->doff = sizeof(*t1)/4+1;
3286 ptr = skb_put(buff,4);
3287 ptr[0] = 2;
3288 ptr[1] = 4;
3289 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3290 ptr[3] =(newsk->mtu) & 0xff;
3291
3292 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3293 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3294 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3295 skb->sk = newsk;
3296
3297 /*
3298 * Charge the sock_buff to newsk.
3299 */
3300
3301 sk->rmem_alloc -= skb->truesize;
3302 newsk->rmem_alloc += skb->truesize;
3303
3304 skb_queue_tail(&sk->receive_queue,skb);
3305 sk->ack_backlog++;
3306 release_sock(newsk);
3307 tcp_statistics.TcpOutSegs++;
3308 }
3309
3310
3311 static void tcp_close(struct sock *sk, int timeout)
3312 {
3313 /*
3314 * We need to grab some memory, and put together a FIN,
3315 * and then put it into the queue to be sent.
3316 */
3317
3318 sk->inuse = 1;
3319
3320 if(th_cache_sk==sk)
3321 tcp_cache_zap();
3322 if(sk->state == TCP_LISTEN)
3323 {
3324 /* Special case */
3325 tcp_set_state(sk, TCP_CLOSE);
3326 tcp_close_pending(sk);
3327 release_sock(sk);
3328 return;
3329 }
3330
3331 sk->keepopen = 1;
3332 sk->shutdown = SHUTDOWN_MASK;
3333
3334 if (!sk->dead)
3335 sk->state_change(sk);
3336
3337 if (timeout == 0)
3338 {
3339 struct sk_buff *skb;
3340
3341 /*
3342 * We need to flush the recv. buffs. We do this only on the
3343 * descriptor close, not protocol-sourced closes, because the
3344 * reader process may not have drained the data yet!
3345 */
3346
3347 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3348 kfree_skb(skb, FREE_READ);
3349 /*
3350 * Get rid of any half-completed packets.
3351 */
3352
3353 if (sk->partial)
3354 tcp_send_partial(sk);
3355 }
3356
3357
3358 /*
3359 * Timeout is not the same thing - however the code likes
3360 * to send both the same way (sigh).
3361 */
3362
3363 if(timeout)
3364 {
3365 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3366 }
3367 else
3368 {
3369 if(tcp_close_state(sk,1)==1)
3370 {
3371 tcp_send_fin(sk);
3372 }
3373 }
3374 release_sock(sk);
3375 }
3376
3377
3378 /*
3379 * This routine takes stuff off of the write queue,
3380 * and puts it in the xmit queue. This happens as incoming acks
3381 * open up the remote window for us.
3382 */
3383
3384 static void tcp_write_xmit(struct sock *sk)
3385 {
3386 struct sk_buff *skb;
3387
3388 /*
3389 * The bytes will have to remain here. In time closedown will
3390 * empty the write queue and all will be happy
3391 */
3392
3393 if(sk->zapped)
3394 return;
3395
3396 /*
3397 * Anything on the transmit queue that fits the window can
3398 * be added providing we are not
3399 *
3400 * a) retransmitting (Nagle's rule)
3401 * b) exceeding our congestion window.
3402 */
3403
3404 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3405 before(skb->end_seq, sk->window_seq + 1) &&
3406 (sk->retransmits == 0 ||
3407 sk->ip_xmit_timeout != TIME_WRITE ||
3408 before(skb->end_seq, sk->rcv_ack_seq + 1))
3409 && sk->packets_out < sk->cong_window)
3410 {
3411 IS_SKB(skb);
3412 skb_unlink(skb);
3413
3414 /*
3415 * See if we really need to send the packet.
3416 */
3417
3418 if (before(skb->end_seq, sk->rcv_ack_seq +1))
3419 {
3420 /*
3421 * This is acked data. We can discard it. This
3422 * cannot currently occur.
3423 */
3424
3425 sk->retransmits = 0;
3426 kfree_skb(skb, FREE_WRITE);
3427 if (!sk->dead)
3428 sk->write_space(sk);
3429 }
3430 else
3431 {
3432 struct tcphdr *th;
3433 struct iphdr *iph;
3434 int size;
3435 /*
3436 * put in the ack seq and window at this point rather than earlier,
3437 * in order to keep them monotonic. We really want to avoid taking
3438 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3439 * Ack and window will in general have changed since this packet was put
3440 * on the write queue.
3441 */
3442 iph = skb->ip_hdr;
3443 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3444 size = skb->len - (((unsigned char *) th) - skb->data);
3445 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
3446 if (size > sk->mtu - sizeof(struct iphdr))
3447 {
3448 iph->frag_off &= ~htons(IP_DF);
3449 ip_send_check(iph);
3450 }
3451 #endif
3452
3453 th->ack_seq = htonl(sk->acked_seq);
3454 th->window = htons(tcp_select_window(sk));
3455
3456 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3457
3458 sk->sent_seq = skb->end_seq;
3459
3460 /*
3461 * IP manages our queue for some crazy reason
3462 */
3463
3464 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3465
3466
3467 sk->ack_backlog = 0;
3468 sk->bytes_rcv = 0;
3469
3470 /*
3471 * Again we slide the timer wrongly
3472 */
3473
3474 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3475 }
3476 }
3477 }
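
/*
 * The loop condition above, isolated as a predicate (sketch,
 * hypothetical name): a queued segment may leave when it fits the
 * offered window, we are not mid-retransmission (unless the segment
 * is already acked territory), and the congestion window has room.
 */

static inline int tcp_may_transmit(struct sock *sk, struct sk_buff *skb)
{
	return before(skb->end_seq, sk->window_seq + 1) &&
	       (sk->retransmits == 0 ||
		sk->ip_xmit_timeout != TIME_WRITE ||
		before(skb->end_seq, sk->rcv_ack_seq + 1)) &&
	       sk->packets_out < sk->cong_window;
}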
3478
3479
3480 /*
3481 * This routine deals with incoming acks, but not outgoing ones.
3482 */
3483
3484 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3485 {
3486 u32 ack;
3487 int flag = 0;
3488
3489 /*
3490 * 1 - there was data in packet as well as ack or new data is sent or
3491 * in shutdown state
3492 * 2 - data from retransmit queue was acked and removed
3493 * 4 - window shrunk or data from retransmit queue was acked and removed
3494 */
3495
3496 if(sk->zapped)
3497 return(1); /* Dead, can't ack any more so why bother */
3498
3499 /*
3500 * Have we discovered a larger window
3501 */
3502
3503 ack = ntohl(th->ack_seq);
3504
3505 if (ntohs(th->window) > sk->max_window)
3506 {
3507 sk->max_window = ntohs(th->window);
3508 #ifdef CONFIG_INET_PCTCP
3509 /* Hack because we don't send partial packets to non SWS
3510 handling hosts */
3511 sk->mss = min(sk->max_window>>1, sk->mtu);
3512 #else
3513 sk->mss = min(sk->max_window, sk->mtu);
3514 #endif
3515 }
3516
3517 /*
3518 * We have dropped back to keepalive timeouts. Thus we have
3519 * no retransmits pending.
3520 */
3521
3522 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3523 sk->retransmits = 0;
3524
3525 /*
3526 * If the ack is newer than sent or older than previous acks
3527 * then we can probably ignore it.
3528 */
3529
3530 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3531 {
3532 if(sk->debug)
3533 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3534
3535 /*
3536 * Keepalive processing.
3537 */
3538
3539 if (after(ack, sk->sent_seq))
3540 {
3541 return(0);
3542 }
3543
3544 /*
3545 * Restart the keepalive timer.
3546 */
3547
3548 if (sk->keepopen)
3549 {
3550 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3551 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3552 }
3553 return(1);
3554 }
3555
3556 /*
3557 * If there is data set flag 1
3558 */
3559
3560 if (len != th->doff*4)
3561 flag |= 1;
3562
3563 /*
3564 * See if our window has been shrunk.
3565 */
3566
3567 if (after(sk->window_seq, ack+ntohs(th->window)))
3568 {
3569 /*
3570 * We may need to move packets from the send queue
3571 * to the write queue, if the window has been shrunk on us.
3572 * The RFC says you are not allowed to shrink your window
3573 * like this, but if the other end does, you must be able
3574 * to deal with it.
3575 */
3576 struct sk_buff *skb;
3577 struct sk_buff *skb2;
3578 struct sk_buff *wskb = NULL;
3579
3580 skb2 = sk->send_head;
3581 sk->send_head = NULL;
3582 sk->send_tail = NULL;
3583
3584 /*
3585 * This is an artifact of a flawed concept. We want one
3586 * queue and a smarter send routine when we send all.
3587 */
3588
3589 flag |= 4; /* Window changed */
3590
3591 sk->window_seq = ack + ntohs(th->window);
3592 cli();
3593 while (skb2 != NULL)
3594 {
3595 skb = skb2;
3596 skb2 = skb->link3;
3597 skb->link3 = NULL;
3598 if (after(skb->end_seq, sk->window_seq))
3599 {
3600 if (sk->packets_out > 0)
3601 sk->packets_out--;
3602 /* We may need to remove this from the dev send list. */
3603 if (skb->next != NULL)
3604 {
3605 skb_unlink(skb);
3606 }
3607 /* Now add it to the write_queue. */
3608 if (wskb == NULL)
3609 skb_queue_head(&sk->write_queue,skb);
3610 else
3611 skb_append(wskb,skb);
3612 wskb = skb;
3613 }
3614 else
3615 {
3616 if (sk->send_head == NULL)
3617 {
3618 sk->send_head = skb;
3619 sk->send_tail = skb;
3620 }
3621 else
3622 {
3623 sk->send_tail->link3 = skb;
3624 sk->send_tail = skb;
3625 }
3626 skb->link3 = NULL;
3627 }
3628 }
3629 sti();
3630 }
3631
3632 /*
3633 * Pipe has emptied
3634 */
3635
3636 if (sk->send_tail == NULL || sk->send_head == NULL)
3637 {
3638 sk->send_head = NULL;
3639 sk->send_tail = NULL;
3640 sk->packets_out= 0;
3641 }
3642
3643 /*
3644 * Update the right hand window edge of the host
3645 */
3646
3647 sk->window_seq = ack + ntohs(th->window);
3648
3649 /*
3650 * We don't want too many packets out there.
3651 */
3652
3653 if (sk->ip_xmit_timeout == TIME_WRITE &&
3654 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3655 {
3656 /*
3657 * This is Jacobson's slow start and congestion avoidance.
3658 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3659 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3660 * counter and increment it once every cwnd times. It's possible
3661 * that this should be done only if sk->retransmits == 0. I'm
3662 * interpreting "new data is acked" as including data that has
3663 * been retransmitted but is just now being acked.
3664 */
3665 if (sk->cong_window < sk->ssthresh)
3666 /*
3667 * In "safe" area, increase
3668 */
3669 sk->cong_window++;
3670 else
3671 {
3672 /*
3673 * In dangerous area, increase slowly. In theory this is
3674 * sk->cong_window += 1 / sk->cong_window
3675 */
3676 if (sk->cong_count >= sk->cong_window)
3677 {
3678 sk->cong_window++;
3679 sk->cong_count = 0;
3680 }
3681 else
3682 sk->cong_count++;
3683 }
3684 }
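
/*
 * Isolated view (sketch): below ssthresh the window grows one full
 * segment per ack - doubling per round trip; above it, one segment
 * per cong_window acks, i.e. roughly cwnd += 1/cwnd, linear per round
 * trip. E.g. with ssthresh = 8 the window runs 1,2,3,...,8 on
 * successive acks, then needs eight acks to go from 8 to 9.
 */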
3685
3686 /*
3687 * Remember the highest ack received.
3688 */
3689
3690 sk->rcv_ack_seq = ack;
3691
3692 /*
3693 * We passed data and got it acked, remove any soft error
3694 * log. Something worked...
3695 */
3696
3697 sk->err_soft = 0;
3698
3699 /*
3700 * If this ack opens up a zero window, clear backoff. It was
3701 * being used to time the probes, and is probably far higher than
3702 * it needs to be for normal retransmission.
3703 */
3704
3705 if (sk->ip_xmit_timeout == TIME_PROBE0)
3706 {
3707 sk->retransmits = 0; /* Our probe was answered */
3708
3709 /*
3710 * Was it a usable window open ?
3711 */
3712
3713 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3714 ! before (sk->window_seq, sk->write_queue.next->end_seq))
3715 {
3716 sk->backoff = 0;
3717
3718 /*
3719 * Recompute rto from rtt. this eliminates any backoff.
3720 */
3721
3722 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3723 if (sk->rto > 120*HZ)
3724 sk->rto = 120*HZ;
3725 if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about
3726 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3727 .2 of a second is going to need huge windows (SIGH) */
3728 sk->rto = HZ/5;
3729 }
3730 }
3731
3732 /*
3733 * See if we can take anything off of the retransmit queue.
3734 */
3735
3736 while(sk->send_head != NULL)
3737 {
3738 /* Check for a bug. */
3739 if (sk->send_head->link3 &&
3740 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
3741 printk("INET: tcp.c: *** bug send_list out of order.\n");
3742
3743 /*
3744 * If our packet is before the ack sequence we can
3745 * discard it as it's confirmed to have arrived at the other end.
3746 */
3747
3748 if (before(sk->send_head->end_seq, ack+1))
3749 {
3750 struct sk_buff *oskb;
3751 if (sk->retransmits)
3752 {
3753 /*
3754 * We were retransmitting. don't count this in RTT est
3755 */
3756 flag |= 2;
3757
3758 /*
3759 * even though we've gotten an ack, we're still
3760 * retransmitting as long as we're sending from
3761 * the retransmit queue. Keeping retransmits non-zero
3762 * prevents us from getting new data interspersed with
3763 * retransmissions.
3764 */
3765
3766 if (sk->send_head->link3) /* Any more queued retransmits? */
3767 sk->retransmits = 1;
3768 else
3769 sk->retransmits = 0;
3770 }
3771 /*
3772 * Note that we only reset backoff and rto in the
3773 * rtt recomputation code. And that doesn't happen
3774 * if there were retransmissions in effect. So the
3775 * first new packet after the retransmissions is
3776 * sent with the backoff still in effect. Not until
3777 * we get an ack from a non-retransmitted packet do
3778 * we reset the backoff and rto. This allows us to deal
3779 * with a situation where the network delay has increased
3780 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3781 */
3782
3783 /*
3784 * We have one less packet out there.
3785 */
3786
3787 if (sk->packets_out > 0)
3788 sk->packets_out --;
3789 /*
3790 * Wake up the process, it can probably write more.
3791 */
3792 if (!sk->dead)
3793 sk->write_space(sk);
3794 oskb = sk->send_head;
3795
3796 if (!(flag&2)) /* Not retransmitting */
3797 {
3798 long m;
3799
3800 /*
3801 * The following amusing code comes from Jacobson's
3802 * article in SIGCOMM '88. Note that rtt and mdev
3803 * are scaled versions of rtt and mean deviation.
3804 * This is designed to be as fast as possible
3805 * m stands for "measurement".
3806 */
3807
3808 m = jiffies - oskb->when; /* RTT */
3809 if(m<=0)
3810 m=1; /* IS THIS RIGHT FOR <0 ??? */
3811 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3812 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3813 if (m < 0)
3814 m = -m; /* m is now abs(error) */
3815 m -= (sk->mdev >> 2); /* similar update on mdev */
3816 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3817
3818 /*
3819 * Now update timeout. Note that this removes any backoff.
3820 */
3821
3822 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3823 if (sk->rto > 120*HZ)
3824 sk->rto = 120*HZ;
3825 if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3826 sk->rto = HZ/5;
3827 sk->backoff = 0;
3828 }
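/*
 * Worked example of the fixed-point update above (illustrative
 * numbers, not from the source). sk->rtt holds 8*srtt and sk->mdev
 * holds 4*mdev, so with sk->rtt == 800 (srtt 100 jiffies),
 * sk->mdev == 40 (mdev 10) and a new measurement m == 120 jiffies:
 *
 * m -= 800 >> 3; -> m = 20 (error in the estimate)
 * sk->rtt += m; -> sk->rtt = 820 (srtt 102.5)
 * m -= 40 >> 2; -> m = 10 (deviation error)
 * sk->mdev += m; -> sk->mdev = 50 (mdev 12.5)
 * rto = ((820 >> 2) + 50) >> 1 = 127, roughly srtt + 2*mdev.
 */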
3829 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt'
3830 in this case, as we just set it up */
3831 cli();
3832 oskb = sk->send_head;
3833 IS_SKB(oskb);
3834 sk->send_head = oskb->link3;
3835 if (sk->send_head == NULL)
3836 {
3837 sk->send_tail = NULL;
3838 }
3839
3840 /*
3841 * We may need to remove this from the dev send list.
3842 */
3843
3844 if (oskb->next)
3845 skb_unlink(oskb);
3846 sti();
3847 kfree_skb(oskb, FREE_WRITE); /* write. */
3848 if (!sk->dead)
3849 sk->write_space(sk);
3850 }
3851 else
3852 {
3853 break;
3854 }
3855 }
3856
3857 /*
3858 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3859 * returns non-NULL, we completely ignore the timer stuff in the else
3860 * clause. We ought to organize the code so that the else clause can
3861 * (should) be executed regardless, possibly moving the PROBE timer
3862 * reset over. The skb_peek() thing should only move stuff to the
3863 * write queue, NOT also manage the timer functions.
3864 */
3865
3866 /*
3867 * Maybe we can take some stuff off of the write queue,
3868 * and put it onto the xmit queue.
3869 */
3870 if (skb_peek(&sk->write_queue) != NULL)
3871 {
3872 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3873 (sk->retransmits == 0 ||
3874 sk->ip_xmit_timeout != TIME_WRITE ||
3875 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3876 && sk->packets_out < sk->cong_window)
3877 {
3878 /*
3879 * Add more data to the send queue.
3880 */
3881 flag |= 1;
3882 tcp_write_xmit(sk);
3883 }
3884 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3885 sk->send_head == NULL &&
3886 sk->ack_backlog == 0 &&
3887 sk->state != TCP_TIME_WAIT)
3888 {
3889 /*
3890 * Data to queue but no room.
3891 */
3892 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3893 }
3894 }
3895 else
3896 {
3897 /*
3898 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3899 * from TCP_CLOSE we don't do anything
3900 *
3901 * from anything else, if there is write data (or fin) pending,
3902 * we use a TIME_WRITE timeout, else if keepalive we reset to
3903 * a KEEPALIVE timeout, else we delete the timer.
3904 *
3905 * We do not set flag for nominal write data, otherwise we may
3906 * force a state where we start to write itsy bitsy tidbits
3907 * of data.
3908 */
3909
3910 switch(sk->state) {
3911 case TCP_TIME_WAIT:
3912 /*
3913 * keep us in TIME_WAIT until we stop getting packets,
3914 * reset the timeout.
3915 */
3916 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3917 break;
3918 case TCP_CLOSE:
3919 /*
3920 * don't touch the timer.
3921 */
3922 break;
3923 default:
3924 /*
3925 * Must check send_head, write_queue, and ack_backlog
3926 * to determine which timeout to use.
3927 */
3928 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3929 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3930 } else if (sk->keepopen) {
3931 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3932 } else {
3933 del_timer(&sk->retransmit_timer);
3934 sk->ip_xmit_timeout = 0;
3935 }
3936 break;
3937 }
3938 }
3939
3940 /*
3941 * We have nothing queued but space to send. Send any partial
3942 * packets immediately (end of Nagle rule application).
3943 */
3944
3945 if (sk->packets_out == 0 && sk->partial != NULL &&
3946 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3947 {
3948 flag |= 1;
3949 tcp_send_partial(sk);
3950 }
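/*
 * Illustrative timeline (assumed scenario, not from the source): a
 * caller issuing many small write() calls leaves a sub-MSS fragment
 * in sk->partial while earlier data is unacknowledged, so the Nagle
 * rule holds it back. Once the ACK processed above has emptied both
 * the write and retransmit queues (packets_out == 0), nothing further
 * can trigger the send, so the fragment is flushed here immediately
 * rather than waiting for a timer.
 */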
3951
3952 /*
3953 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3954 * we are now waiting for an acknowledge to our FIN. The other end is
3955 * already in TIME_WAIT.
3956 *
3957 * Move to TCP_CLOSE on success.
3958 */
3959
3960 if (sk->state == TCP_LAST_ACK)
3961 {
3962 if (!sk->dead)
3963 sk->state_change(sk);
3964 if(sk->debug)
3965 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3966 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3967 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3968 {
3969 flag |= 1;
3970 sk->shutdown = SHUTDOWN_MASK;
3971 tcp_set_state(sk,TCP_CLOSE);
3972 return 1;
3973 }
3974 }
3975
3976 /*
3977 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3978 *
3979 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3980 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3981 */
3982
3983 if (sk->state == TCP_FIN_WAIT1)
3984 {
3985
3986 if (!sk->dead)
3987 sk->state_change(sk);
3988 if (sk->rcv_ack_seq == sk->write_seq)
3989 {
3990 flag |= 1;
3991 sk->shutdown |= SEND_SHUTDOWN;
3992 tcp_set_state(sk, TCP_FIN_WAIT2);
3993 }
3994 }
3995
3996 /*
3997 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3998 *
3999 * Move to TIME_WAIT
4000 */
4001
4002 if (sk->state == TCP_CLOSING)
4003 {
4004
4005 if (!sk->dead)
4006 sk->state_change(sk);
4007 if (sk->rcv_ack_seq == sk->write_seq)
4008 {
4009 flag |= 1;
4010 tcp_time_wait(sk);
4011 }
4012 }
4013
4014 /*
4015 * Final ack of a three way shake
4016 */
4017
4018 if(sk->state==TCP_SYN_RECV)
4019 {
4020 tcp_set_state(sk, TCP_ESTABLISHED);
4021 tcp_options(sk,th);
4022 sk->dummy_th.dest=th->source;
4023 sk->copied_seq = sk->acked_seq;
4024 if(!sk->dead)
4025 sk->state_change(sk);
4026 if(sk->max_window==0)
4027 {
4028 sk->max_window=32; /* Sanity check */
4029 sk->mss=min(sk->max_window,sk->mtu);
4030 }
4031 }
4032
4033 /*
4034 * I make no guarantees about the first clause in the following
4035 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
4036 * what conditions "!flag" would be true. However I think the rest
4037 * of the conditions would prevent that from causing any
4038 * unnecessary retransmission.
4039 * Clearly if the first packet has expired it should be
4040 * retransmitted. The other alternative, "flag&2 && retransmits", is
4041 * harder to explain: You have to look carefully at how and when the
4042 * timer is set and with what timeout. The most recent transmission always
4043 * sets the timer. So in general if the most recent thing has timed
4044 * out, everything before it has as well. So we want to go ahead and
4045 * retransmit some more. If we didn't explicitly test for this
4046 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
4047 * would not be true. If you look at the pattern of timing, you can
4048 * show that rto is increased fast enough that the next packet would
4049 * almost never be retransmitted immediately. Then you'd end up
4050 * waiting for a timeout to send each packet on the retransmission
4051 * queue. With my implementation of the Karn sampling algorithm,
4052 * the timeout would double each time. The net result is that it would
4053 * take a hideous amount of time to recover from a single dropped packet.
4054 * It's possible that there should also be a test for TIME_WRITE, but
4055 * I think as long as "send_head != NULL" and "retransmit" is on, we've
4056 * got to be in real retransmission mode.
4057 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
4058 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
4059 * As long as no further losses occur, this seems reasonable.
4060 */
4061
4062 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
4063 (((flag&2) && sk->retransmits) ||
4064 (sk->send_head->when + sk->rto < jiffies)))
4065 {
4066 if(sk->send_head->when + sk->rto < jiffies)
4067 tcp_retransmit(sk,0);
4068 else
4069 {
4070 tcp_do_retransmit(sk, 1);
4071 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4072 }
4073 }
4074
4075 return(1);
4076 }
4077
4078
4079 /*
4080 * Process the FIN bit. This now behaves as it is supposed to work
4081 * and the FIN takes effect when it is validly part of sequence
4082 * space, not earlier, while holes remain before it.
4083 *
4084 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
4085 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
4086 * TIME-WAIT)
4087 *
4088 * If we are in FINWAIT-1, a received FIN indicates simultaneous
4089 * close and we go into CLOSING (and later onto TIME-WAIT)
4090 *
4091 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4092 *
4093 */
4094
4095 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
4096 {
4097 sk->fin_seq = skb->end_seq;
4098
4099 if (!sk->dead)
4100 {
4101 sk->state_change(sk);
4102 sock_wake_async(sk->socket, 1);
4103 }
4104
4105 switch(sk->state)
4106 {
4107 case TCP_SYN_RECV:
4108 case TCP_SYN_SENT:
4109 case TCP_ESTABLISHED:
4110 /*
4111 * move to CLOSE_WAIT, tcp_data() already handled
4112 * sending the ack.
4113 */
4114 tcp_set_state(sk,TCP_CLOSE_WAIT);
4115 if (th->rst)
4116 sk->shutdown = SHUTDOWN_MASK;
4117 break;
4118
4119 case TCP_CLOSE_WAIT:
4120 case TCP_CLOSING:
4121 /*
4122 * received a retransmission of the FIN, do
4123 * nothing.
4124 */
4125 break;
4126 case TCP_TIME_WAIT:
4127 /*
4128 * received a retransmission of the FIN,
4129 * restart the TIME_WAIT timer.
4130 */
4131 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4132 return(0);
4133 case TCP_FIN_WAIT1:
4134 /*
4135 * This case occurs when a simultaneous close
4136 * happens, we must ack the received FIN and
4137 * enter the CLOSING state.
4138 *
4139 * This causes a WRITE timeout, which will either
4140 * move on to TIME_WAIT when we timeout, or resend
4141 * the FIN properly (maybe we get rid of that annoying
4142 * FIN lost hang). The TIME_WRITE code is already correct
4143 * for handling this timeout.
4144 */
4145
4146 if(sk->ip_xmit_timeout != TIME_WRITE)
4147 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4148 tcp_set_state(sk,TCP_CLOSING);
4149 break;
4150 case TCP_FIN_WAIT2:
4151 /*
4152 * received a FIN -- send ACK and enter TIME_WAIT
4153 */
4154 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4155 sk->shutdown|=SHUTDOWN_MASK;
4156 tcp_set_state(sk,TCP_TIME_WAIT);
4157 break;
4158 case TCP_CLOSE:
4159 /*
4160 * already in CLOSE
4161 */
4162 break;
4163 default:
4164 tcp_set_state(sk,TCP_LAST_ACK);
4165
4166 /* Start the timers. */
4167 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4168 return(0);
4169 }
4170
4171 return(0);
4172 }
4173
4174
4175
4176 /*
4177 * This routine handles the data. If there is room in the buffer,
4178 * it will have already been moved into it. If there is no
4179 * room, then we will just have to discard the packet.
4180 */
4181
4182 extern /* __inline__ */ int tcp_data(struct sk_buff *skb, struct sock *sk,
4183 unsigned long saddr, unsigned short len)
4184 {
4185 struct sk_buff *skb1, *skb2;
4186 struct tcphdr *th;
4187 int dup_dumped=0;
4188 u32 new_seq, shut_seq;
4189
4190 th = skb->h.th;
4191 skb_pull(skb,th->doff*4);
4192 skb_trim(skb,len-(th->doff*4));
4193
4194 /*
4195 * The bytes in the receive read/assembly queue have increased. Needed for the
4196 * low memory discard algorithm
4197 */
4198
4199 sk->bytes_rcv += skb->len;
4200
4201 if (skb->len == 0 && !th->fin)
4202 {
4203 /*
4204 * Don't want to keep passing acks back and forth.
4205 * (someone sent us a dataless, boring frame)
4206 */
4207 if (!th->ack)
4208 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4209 kfree_skb(skb, FREE_READ);
4210 return(0);
4211 }
4212
4213 /*
4214 * We no longer have anyone receiving data on this connection.
4215 */
4216
4217 #ifndef TCP_DONT_RST_SHUTDOWN
4218
4219 if(sk->shutdown & RCV_SHUTDOWN)
4220 {
4221 /*
4222 * FIXME: BSD has some magic to avoid sending resets to
4223 * broken 4.2 BSD keepalives. Much to my surprise a few non
4224 * BSD stacks still have broken keepalives so we want to
4225 * cope with it.
4226 */
4227
4228 if(skb->len) /* We don't care if it's just an ack or
4229 a keepalive/window probe */
4230 {
4231 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4232
4233 /* Do this the way 4.4BSD treats it. Not what I'd
4234 regard as the meaning of the spec but it's what BSD
4235 does and clearly they know everything 8) */
4236
4237 /*
4238 * This is valid because of two things
4239 *
4240 * a) The way tcp_data behaves at the bottom.
4241 * b) A fin takes effect when read not when received.
4242 */
4243
4244 shut_seq = sk->acked_seq+1; /* Last byte */
4245
4246 if(after(new_seq,shut_seq))
4247 {
4248 if(sk->debug)
4249 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4250 sk, new_seq, shut_seq, sk->blog);
4251 if(sk->dead)
4252 {
4253 sk->acked_seq = new_seq + th->fin;
4254 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4255 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4256 tcp_statistics.TcpEstabResets++;
4257 sk->err = EPIPE;
4258 sk->error_report(sk);
4259 sk->shutdown = SHUTDOWN_MASK;
4260 tcp_set_state(sk,TCP_CLOSE);
4261 kfree_skb(skb, FREE_READ);
4262 return 0;
4263 }
4264 }
4265 }
4266 }
4267
4268 #endif
4269
4270 /*
4271 * Now we have to walk the chain, and figure out where this one
4272 * goes into it. This is set up so that the last packet we received
4273 * will be the first one we look at, that way if everything comes
4274 * in order, there will be no performance loss, and if they come
4275 * out of order we will be able to fit things in nicely.
4276 *
4277 * [AC: This is wrong. We should assume in order first and then walk
4278 * forwards from the first hole based upon real traffic patterns.]
4279 *
4280 */
4281
4282 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4283 {
4284 skb_queue_head(&sk->receive_queue,skb);
4285 skb1= NULL;
4286 }
4287 else
4288 {
4289 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4290 {
4291 if(sk->debug)
4292 {
4293 printk("skb1=%p :", skb1);
4294 printk("skb1->seq = %d: ", skb1->seq);
4295 printk("skb->seq = %d\n",skb->seq);
4296 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4297 sk->acked_seq);
4298 }
4299
4300 /*
4301 * Optimisation: Duplicate frame or extension of previous frame from
4302 * same sequence point (lost ack case).
4303 * The frame contains duplicate data or replaces a previous frame;
4304 * discard the previous frame (safe as sk->inuse is set) and put
4305 * the new one in its place.
4306 */
4307
4308 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4309 {
4310 skb_append(skb1,skb);
4311 skb_unlink(skb1);
4312 kfree_skb(skb1,FREE_READ);
4313 dup_dumped=1;
4314 skb1=NULL;
4315 break;
4316 }
4317
4318 /*
4319 * Found where it fits
4320 */
4321
4322 if (after(skb->seq+1, skb1->seq))
4323 {
4324 skb_append(skb1,skb);
4325 break;
4326 }
4327
4328 /*
4329 * See if we've hit the start. If so insert.
4330 */
4331 if (skb1 == skb_peek(&sk->receive_queue))
4332 {
4333 skb_queue_head(&sk->receive_queue, skb);
4334 break;
4335 }
4336 }
4337 }
4338
4339 /*
4340 * Figure out what the ack value for this frame is
4341 */
4342
4343 if (before(sk->acked_seq, sk->copied_seq))
4344 {
4345 printk("*** tcp.c:tcp_data bug acked < copied\n");
4346 sk->acked_seq = sk->copied_seq;
4347 }
4348
4349 /*
4350 * Now figure out if we can ack anything. This is very messy because we really want two
4351 * receive queues, a completed and an assembly queue. We also want only one transmit
4352 * queue.
4353 */
4354
4355 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4356 {
4357 if (before(skb->seq, sk->acked_seq+1))
4358 {
4359
4360 if (after(skb->end_seq, sk->acked_seq))
4361 sk->acked_seq = skb->end_seq;
4362
4363 skb->acked = 1;
4364
4365 /*
4366 * When we ack the fin, we do the FIN
4367 * processing.
4368 */
4369
4370 if (skb->h.th->fin)
4371 {
4372 tcp_fin(skb,sk,skb->h.th);
4373 }
4374
4375 for(skb2 = skb->next;
4376 skb2 != (struct sk_buff *)&sk->receive_queue;
4377 skb2 = skb2->next)
4378 {
4379 if (before(skb2->seq, sk->acked_seq+1))
4380 {
4381 if (after(skb2->end_seq, sk->acked_seq))
4382 sk->acked_seq = skb2->end_seq;
4383
4384 skb2->acked = 1;
4385 /*
4386 * When we ack the fin, we do
4387 * the fin handling.
4388 */
4389 if (skb2->h.th->fin)
4390 {
4391 tcp_fin(skb2,sk,skb2->h.th);
4392 }
4393
4394 /*
4395 * Force an immediate ack.
4396 */
4397
4398 sk->ack_backlog = sk->max_ack_backlog;
4399 }
4400 else
4401 {
4402 break;
4403 }
4404 }
4405
4406 /*
4407 * This also takes care of updating the window.
4408 * This if statement needs to be simplified.
4409 *
4410 * rules for delaying an ack:
4411 * - delay time <= 0.5 HZ
4412 * - we don't have a window update to send
4413 * - must send at least every 2 full sized packets
4414 */
4415 if (!sk->delay_acks ||
4416 sk->ack_backlog >= sk->max_ack_backlog ||
4417 sk->bytes_rcv > sk->max_unacked || th->fin ||
4418 sk->ato > HZ/2 ||
4419 tcp_raise_window(sk)) {
4420 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4421 }
4422 else
4423 {
4424 sk->ack_backlog++;
4425
4426 if(sk->debug)
4427 printk("Ack queued.\n");
4428 reset_xmit_timer(sk, TIME_WRITE, sk->ato);
4429 }
4430 }
4431 }
4432
4433 /*
4434 * If we've missed a packet, send an ack.
4435 * Also start a timer to send another.
4436 */
4437
4438 if (!skb->acked)
4439 {
4440
4441 /*
4442 * This is important. If we don't have much room left,
4443 * we need to throw out a few packets so we have a good
4444 * window. Note that mtu is used, not mss, because mss is really
4445 * for the send side. The peer could be sending us stuff as large as mtu.
4446 */
4447
4448 while (sock_rspace(sk) < sk->mtu)
4449 {
4450 skb1 = skb_peek(&sk->receive_queue);
4451 if (skb1 == NULL)
4452 {
4453 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4454 break;
4455 }
4456
4457 /*
4458 * Don't throw out something that has been acked.
4459 */
4460
4461 if (skb1->acked)
4462 {
4463 break;
4464 }
4465
4466 skb_unlink(skb1);
4467 kfree_skb(skb1, FREE_READ);
4468 }
4469 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4470 sk->ack_backlog++;
4471 reset_xmit_timer(sk, TIME_WRITE, min(sk->ato, HZ/2));
4472 }
4473 else
4474 {
4475 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4476 }
4477
4478 /*
4479 * Now tell the user we may have some data.
4480 */
4481
4482 if (!sk->dead)
4483 {
4484 if(sk->debug)
4485 printk("Data wakeup.\n");
4486 sk->data_ready(sk,0);
4487 }
4488 return(0);
4489 }
4490
4491
4492 /*
4493 * This routine is only called when we have urgent data
4494 * signalled. It's the 'slow' part of tcp_urg. It could be
4495 * moved inline now as tcp_urg is only called from one
4496 * place. We handle URGent data wrong. We have to - as
4497 * BSD still doesn't use the correction from RFC961.
4498 */
4499
4500 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4501 {
4502 u32 ptr = ntohs(th->urg_ptr);
4503
4504 if (ptr)
4505 ptr--;
4506 ptr += ntohl(th->seq);
4507
4508 /* ignore urgent data that we've already seen and read */
4509 if (after(sk->copied_seq, ptr))
4510 return;
4511
4512 /* do we already have a newer (or duplicate) urgent pointer? */
4513 if (sk->urg_data && !after(ptr, sk->urg_seq))
4514 return;
4515
4516 /* tell the world about our new urgent pointer */
4517 if (sk->proc != 0) {
4518 if (sk->proc > 0) {
4519 kill_proc(sk->proc, SIGURG, 1);
4520 } else {
4521 kill_pg(-sk->proc, SIGURG, 1);
4522 }
4523 }
4524 sk->urg_data = URG_NOTYET;
4525 sk->urg_seq = ptr;
4526 }
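/*
 * Worked example (illustrative numbers): a segment with seq 1000 and
 * urg_ptr 5 marks byte 1004 as urgent -- ptr is decremented once to
 * undo the off-by-one BSD convention, then added to the sequence
 * number. If copied_seq is already past 1004 the event is stale and
 * ignored; otherwise urg_seq becomes 1004 and URG_NOTYET is set until
 * tcp_urg() finds the byte in an arriving segment.
 */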
4527
4528 /*
4529 * This is the 'fast' part of urgent handling.
4530 */
4531
4532 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4533 unsigned long saddr, unsigned long len)
4534 {
4535 u32 ptr;
4536
4537 /*
4538 * Check if we get a new urgent pointer - normally not
4539 */
4540
4541 if (th->urg)
4542 tcp_check_urg(sk,th);
4543
4544 /*
4545 * Do we wait for any urgent data? - normally not
4546 */
4547
4548 if (sk->urg_data != URG_NOTYET)
4549 return 0;
4550
4551 /*
4552 * Is the urgent pointer pointing into this packet?
4553 */
4554
4555 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4556 if (ptr >= len)
4557 return 0;
4558
4559 /*
4560 * Ok, got the correct packet, update info
4561 */
4562
4563 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4564 if (!sk->dead)
4565 sk->data_ready(sk,0);
4566 return 0;
4567 }
4568
4569 /*
4570 * This will accept the next outstanding connection.
4571 */
4572
4573 static struct sock *tcp_accept(struct sock *sk, int flags)
4574 {
4575 struct sock *newsk;
4576 struct sk_buff *skb;
4577
4578 /*
4579 * We need to make sure that this socket is listening,
4580 * and that it has something pending.
4581 */
4582
4583 if (sk->state != TCP_LISTEN)
4584 {
4585 sk->err = EINVAL;
4586 return(NULL);
4587 }
4588
4589 /* Avoid the race. */
4590 cli();
4591 sk->inuse = 1;
4592
4593 while((skb = tcp_dequeue_established(sk)) == NULL)
4594 {
4595 if (flags & O_NONBLOCK)
4596 {
4597 sti();
4598 release_sock(sk);
4599 sk->err = EAGAIN;
4600 return(NULL);
4601 }
4602
4603 release_sock(sk);
4604 interruptible_sleep_on(sk->sleep);
4605 if (current->signal & ~current->blocked)
4606 {
4607 sti();
4608 sk->err = ERESTARTSYS;
4609 return(NULL);
4610 }
4611 sk->inuse = 1;
4612 }
4613 sti();
4614
4615 /*
4616 * Now all we need to do is return skb->sk.
4617 */
4618
4619 newsk = skb->sk;
4620
4621 kfree_skb(skb, FREE_READ);
4622 sk->ack_backlog--;
4623 release_sock(sk);
4624 return(newsk);
4625 }
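/*
 * Hedged userspace sketch (not part of the kernel source): the
 * O_NONBLOCK path above surfaces as EAGAIN from accept() when no
 * established connection is queued; "lfd" is assumed to be a
 * listening socket previously marked non-blocking with fcntl().
 */
#if 0
#include <errno.h>
#include <sys/socket.h>

static int example_poll_accept(int lfd)
{
	int cfd = accept(lfd, (struct sockaddr *)0, (int *)0);

	if (cfd < 0 && errno == EAGAIN)
		return -1;	/* nothing pending yet, try again later */
	return cfd;
}
#endif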
4626
4627
4628 /*
4629 * This will initiate an outgoing connection.
4630 */
4631
4632 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4633 {
4634 struct sk_buff *buff;
4635 struct device *dev=NULL;
4636 unsigned char *ptr;
4637 int tmp;
4638 int atype;
4639 struct tcphdr *t1;
4640 struct rtable *rt;
4641
4642 if (sk->state != TCP_CLOSE)
4643 return(-EISCONN);
4644
4645 /*
4646 * Don't allow a double connect.
4647 */
4648
4649 if(sk->daddr)
4650 return -EINVAL;
4651
4652 if (addr_len < 8)
4653 return(-EINVAL);
4654
4655 if (usin->sin_family && usin->sin_family != AF_INET)
4656 return(-EAFNOSUPPORT);
4657
4658 /*
4659 * connect() to INADDR_ANY means loopback (BSD'ism).
4660 */
4661
4662 if(usin->sin_addr.s_addr==INADDR_ANY)
4663 usin->sin_addr.s_addr=ip_my_addr();
4664
4665 /*
4666 * Don't want a TCP connection going to a broadcast address
4667 */
4668
4669 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4670 return -ENETUNREACH;
4671
4672 sk->inuse = 1;
4673 sk->daddr = usin->sin_addr.s_addr;
4674 sk->write_seq = tcp_init_seq();
4675 sk->window_seq = sk->write_seq;
4676 sk->rcv_ack_seq = sk->write_seq -1;
4677 sk->err = 0;
4678 sk->dummy_th.dest = usin->sin_port;
4679 release_sock(sk);
4680
4681 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4682 if (buff == NULL)
4683 {
4684 return(-ENOMEM);
4685 }
4686 sk->inuse = 1;
4687 buff->sk = sk;
4688 buff->free = 0;
4689 buff->localroute = sk->localroute;
4690
4691
4692 /*
4693 * Put in the IP header and routing stuff.
4694 */
4695
4696 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4697 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4698 if (tmp < 0)
4699 {
4700 sock_wfree(sk, buff);
4701 release_sock(sk);
4702 return(-ENETUNREACH);
4703 }
4704 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4705 sk->saddr = rt->rt_src;
4706 sk->rcv_saddr = sk->saddr;
4707
4708 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4709
4710 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4711 buff->seq = sk->write_seq++;
4712 t1->seq = htonl(buff->seq);
4713 sk->sent_seq = sk->write_seq;
4714 buff->end_seq = sk->write_seq;
4715 t1->ack = 0;
4716 t1->window = 2;
4717 t1->res1=0;
4718 t1->res2=0;
4719 t1->rst = 0;
4720 t1->urg = 0;
4721 t1->psh = 0;
4722 t1->syn = 1;
4723 t1->urg_ptr = 0;
4724 t1->doff = 6;
4725 /* use 512 or whatever the user asked for */
4726
4727 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4728 sk->window_clamp=rt->rt_window;
4729 else
4730 sk->window_clamp=0;
4731
4732 if (sk->user_mss)
4733 sk->mtu = sk->user_mss;
4734 else if (rt)
4735 sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
4736 else
4737 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4738
4739 /*
4740 * but not bigger than device MTU
4741 */
4742
4743 if(sk->mtu <32)
4744 sk->mtu = 32; /* Sanity limit */
4745
4746 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
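/*
 * Worked example (illustrative numbers): a route with rt_mtu == 1500
 * yields sk->mtu = 1500 - 20 - 20 == 1460 bytes of TCP payload per
 * segment; with no route the traditional 576 byte default gives 536.
 * The min() above then re-clamps against the device MTU in case the
 * route information was stale.
 */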
4747
4748 #ifdef CONFIG_SKIP
4749
4750 /*
4751 * SKIP devices set their MTU to 65535. This is so they can take packets
4752 * unfragmented to the security process and then fragment. They could lie to the
4753 * TCP layer about a suitable MTU, but it's easier to let SKIP sort it out,
4754 * simply because the final packet we want unfragmented is going to be
4755 *
4756 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
4757 */
4758
4759 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
4760 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4761 #endif
4762
4763 /*
4764 * Put in the TCP options to say MTU.
4765 */
4766
4767 ptr = skb_put(buff,4);
4768 ptr[0] = 2;
4769 ptr[1] = 4;
4770 ptr[2] = (sk->mtu) >> 8;
4771 ptr[3] = (sk->mtu) & 0xff;
4772 tcp_send_check(t1, sk->saddr, sk->daddr,
4773 sizeof(struct tcphdr) + 4, sk);
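/*
 * The four bytes written above form the MSS option as defined by
 * RFC 793: kind 2, length 4, then the MSS in network byte order.
 * For example (illustrative value), sk->mtu == 1460 produces the
 * byte sequence 02 04 05 b4 on the wire.
 */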
4774
4775 /*
4776 * This must go first, otherwise a really quick response will get reset.
4777 */
4778
4779 tcp_cache_zap();
4780 tcp_set_state(sk,TCP_SYN_SENT);
4781 if(rt&&rt->rt_flags&RTF_IRTT)
4782 sk->rto = rt->rt_irtt;
4783 else
4784 sk->rto = TCP_TIMEOUT_INIT;
4785 sk->retransmit_timer.function=&retransmit_timer;
4786 sk->retransmit_timer.data = (unsigned long)sk;
4787 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4788 sk->retransmits = 0; /* Now works the right way instead of a hacked
4789 initial setting */
4790
4791 sk->prot->queue_xmit(sk, dev, buff, 0);
4792 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4793 tcp_statistics.TcpActiveOpens++;
4794 tcp_statistics.TcpOutSegs++;
4795
4796 release_sock(sk);
4797 return(0);
4798 }
4799
4800 /*
4801 * React to an out-of-window TCP sequence number in an incoming packet
4802 */
4803 static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4804 struct options *opt, unsigned long saddr, struct device *dev)
4805 {
4806 if (th->rst)
4807 return;
4808
4809 /*
4810 * Send a reset if we get something not ours and we are
4811 * unsynchronized. Note: We don't do anything to our end. We
4812 * are just killing the bogus remote connection; then we will
4813 * connect again and it will work (with luck).
4814 */
4815
4816 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4817 {
4818 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4819 return;
4820 }
4821
4822 /* Try to resync things. */
4823 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4824 return;
4825 }
4826
4827 /*
4828 * This function checks to see if the tcp header is actually acceptable.
4829 */
4830
4831 extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
4832 {
4833 /* does the packet contain any unseen data AND */
4834 /* does the packet start before the window? */
4835 return after(end_seq+1, sk->acked_seq) &&
4836 before(seq, sk->acked_seq + sk->window + 1);
4837 }
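/*
 * Worked example (illustrative numbers): with acked_seq == 100 and
 * window == 50 the test accepts any segment that both ends at or
 * after 100 (it carries at least one unseen byte, or sits at the
 * left edge) and starts before 151 (inside the offered window).
 * A pure retransmission ending at 90 fails the first test; a
 * segment starting at 200 fails the second.
 */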
4838
4839 /*
4840 * When we get a reset we do this.
4841 */
4842
4843 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4844 {
4845 sk->zapped = 1;
4846 sk->err = ECONNRESET;
4847 if (sk->state == TCP_SYN_SENT)
4848 sk->err = ECONNREFUSED;
4849 if (sk->state == TCP_CLOSE_WAIT)
4850 sk->err = EPIPE;
4851 #ifdef TCP_DO_RFC1337
4852 /*
4853 * Time wait assassination protection [RFC1337]
4854 */
4855 if(sk->state!=TCP_TIME_WAIT)
4856 {
4857 tcp_set_state(sk,TCP_CLOSE);
4858 sk->shutdown = SHUTDOWN_MASK;
4859 }
4860 #else
4861 tcp_set_state(sk,TCP_CLOSE);
4862 sk->shutdown = SHUTDOWN_MASK;
4863 #endif
4864 if (!sk->dead)
4865 sk->state_change(sk);
4866 kfree_skb(skb, FREE_READ);
4867 release_sock(sk);
4868 return(0);
4869 }
4870
4871 /*
4872 * Find the socket, using the last hit cache if applicable.
4873 */
4874 static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport)
4875 {
4876 struct sock * sk;
4877
4878 sk = (struct sock *) th_cache_sk;
4879 if (saddr != th_cache_saddr || daddr != th_cache_daddr ||
4880 sport != th_cache_sport || dport != th_cache_dport) {
4881 sk = get_sock(&tcp_prot, dport, saddr, sport, daddr);
4882 if (sk) {
4883 th_cache_saddr=saddr;
4884 th_cache_daddr=daddr;
4885 th_cache_dport=dport;
4886 th_cache_sport=sport;
4887 th_cache_sk=sk;
4888 }
4889 }
4890 return sk;
4891 }
4892
4893
4894 /*
4895 * A TCP packet has arrived.
4896 * skb->h.raw is the TCP header.
4897 */
4898
4899 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4900 __u32 daddr, unsigned short len,
4901 __u32 saddr, int redo, struct inet_protocol * protocol)
4902 {
4903 struct tcphdr *th;
4904 struct sock *sk;
4905 int syn_ok=0;
4906
4907 /*
4908 * "redo" is 1 if we have already seen this skb but couldn't
4909 * use it at that time (the socket was locked). In that case
4910 * we have already done a lot of the work (looked up the socket
4911 * etc).
4912 */
4913 th = skb->h.th;
4914 sk = skb->sk;
4915 if (!redo) {
4916 tcp_statistics.TcpInSegs++;
4917 if (skb->pkt_type!=PACKET_HOST)
4918 {
4919 kfree_skb(skb,FREE_READ);
4920 return(0);
4921 }
4922 /*
4923 * Pull up the IP header.
4924 */
4925 skb_pull(skb, skb->h.raw-skb->data);
4926 /*
4927 * Try to use the device checksum if provided.
4928 */
4929 if (
4930 ((skb->ip_summed == CHECKSUM_HW) && tcp_check(th, len, saddr, daddr, skb->csum ))||
4931 ((skb->ip_summed == CHECKSUM_NONE) && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4932 /* skip if CHECKSUM_UNNECESSARY */
4933 )
4934 {
4935 skb->sk = NULL;
4936 kfree_skb(skb,FREE_READ);
4937 /*
4938 * We don't release the socket because it was
4939 * never marked in use.
4940 */
4941 return(0);
4942 }
4943 sk = get_tcp_sock(saddr, th->source, daddr, th->dest);
4944 if (!sk)
4945 goto no_tcp_socket;
4946 skb->sk = sk;
4947 skb->seq = ntohl(th->seq);
4948 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4949 skb->ack_seq = ntohl(th->ack_seq);
4950
4951 skb->acked = 0;
4952 skb->used = 0;
4953 skb->free = 0;
4954 skb->saddr = daddr;
4955 skb->daddr = saddr;
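/*
 * Worked example of the end_seq computation above (illustrative
 * numbers): seq == 1000, len == 540 and doff == 5 give 520 data
 * bytes after the 20 byte header; with FIN set, end_seq is
 * 1000 + 0 + 1 + 540 - 20 == 1521, since SYN and FIN each occupy
 * one unit of sequence space but no data bytes.
 */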
4956
4957 /* We may need to add it to the backlog here. */
4958 cli();
4959 if (sk->inuse)
4960 {
4961 skb_queue_tail(&sk->back_log, skb);
4962 sti();
4963 return(0);
4964 }
4965 sk->inuse = 1;
4966 sti();
4967 }
4968
4969 /*
4970 * If this socket has got a reset it's to all intents and purposes
4971 * really dead. Count closed sockets as dead.
4972 *
4973 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4974 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4975 * exist, so it should cause resets as if the port were unreachable.
4976 */
4977
4978 if (sk->zapped || sk->state==TCP_CLOSE)
4979 goto no_tcp_socket;
4980
4981 if (!sk->prot)
4982 {
4983 printk("IMPOSSIBLE 3\n");
4984 return(0);
4985 }
4986
4987
4988 /*
4989 * Charge the memory to the socket.
4990 */
4991
4992 skb->sk=sk;
4993 sk->rmem_alloc += skb->truesize;
4994
4995 /*
4996 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4997 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4998 * compatibility. We also set up variables more thoroughly [Karn notes in the
4999 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
5000 */
5001
5002 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
5003 {
5004
5005 /*
5006 * Now deal with unusual cases.
5007 */
5008
5009 if(sk->state==TCP_LISTEN)
5010 {
5011 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
5012 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
5013
5014 /*
5015 * We don't care for RST, and non-SYN segments are absorbed (old segments).
5016 * Broadcast/multicast SYN isn't allowed. Note - bug: if you change the
5017 * netmask on a running connection it can go broadcast. Even Suns have
5018 * this problem so I'm ignoring it
5019 */
5020
5021 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
5022 {
5023 kfree_skb(skb, FREE_READ);
5024 release_sock(sk);
5025 return 0;
5026 }
5027
5028 /*
5029 * Guess we need to make a new socket up
5030 */
5031
5032 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
5033
5034 /*
5035 * Now we have several options: In theory there is nothing else
5036 * in the frame. KA9Q has an option to send data with the syn,
5037 * BSD accepts data with the syn up to the [to be] advertised window
5038 * and Solaris 2.1 gives you a protocol error. For now we just ignore
5039 * it, that fits the spec precisely and avoids incompatibilities. It
5040 * would be nice in future to drop through and process the data.
5041 */
5042
5043 release_sock(sk);
5044 return 0;
5045 }
5046
5047 /* retransmitted SYN? */
5048 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
5049 {
5050 kfree_skb(skb, FREE_READ);
5051 release_sock(sk);
5052 return 0;
5053 }
5054
5055 /*
5056 * SYN sent means we have to look for a suitable ack and either reset
5057 * for bad matches or go to connected
5058 */
5059
5060 if(sk->state==TCP_SYN_SENT)
5061 {
5062 /* Crossed SYN or previous junk segment */
5063 if(th->ack)
5064 {
5065 /* We got an ack, but it's not a good ack */
5066 if(!tcp_ack(sk,th,saddr,len))
5067 {
5068 /* Reset the ack - it's an ack from a
5069 different connection [ th->rst is checked in tcp_reset()] */
5070 tcp_statistics.TcpAttemptFails++;
5071 tcp_reset(daddr, saddr, th,
5072 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5073 kfree_skb(skb, FREE_READ);
5074 release_sock(sk);
5075 return(0);
5076 }
5077 if(th->rst)
5078 return tcp_std_reset(sk,skb);
5079 if(!th->syn)
5080 {
5081 /* A valid ack from the start of a different
5082 connection. Shouldn't happen, but cover it */
5083 tcp_statistics.TcpAttemptFails++;
5084 tcp_reset(daddr, saddr, th,
5085 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5086 kfree_skb(skb, FREE_READ);
5087 release_sock(sk);
5088 return 0;
5089 }
5090 /*
5091 * Ok.. it's good. Set up sequence numbers and
5092 * move to established.
5093 */
5094 syn_ok=1; /* Don't reset this connection for the syn */
5095 sk->acked_seq = skb->seq+1;
5096 sk->lastwin_seq = skb->seq+1;
5097 sk->fin_seq = skb->seq;
5098 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5099 tcp_set_state(sk, TCP_ESTABLISHED);
5100 tcp_options(sk,th);
5101 sk->dummy_th.dest=th->source;
5102 sk->copied_seq = sk->acked_seq;
5103 if(!sk->dead)
5104 {
5105 sk->state_change(sk);
5106 sock_wake_async(sk->socket, 0);
5107 }
5108 if(sk->max_window==0)
5109 {
5110 sk->max_window = 32;
5111 sk->mss = min(sk->max_window, sk->mtu);
5112 }
5113 }
5114 else
5115 {
5116 /* See if SYNs cross. Drop if boring */
5117 if(th->syn && !th->rst)
5118 {
5119 /* Crossed SYNs are fine - but talking to
5120 yourself is right out... */
5121 if(sk->saddr==saddr && sk->daddr==daddr &&
5122 sk->dummy_th.source==th->source &&
5123 sk->dummy_th.dest==th->dest)
5124 {
5125 tcp_statistics.TcpAttemptFails++;
5126 return tcp_std_reset(sk,skb);
5127 }
5128 tcp_set_state(sk,TCP_SYN_RECV);
5129
5130 /*
5131 * FIXME:
5132 * Must send SYN|ACK here
5133 */
5134 }
5135 /* Discard junk segment */
5136 kfree_skb(skb, FREE_READ);
5137 release_sock(sk);
5138 return 0;
5139 }
5140 /*
5141 * SYN_RECV with data maybe.. drop through
5142 */
5143 goto rfc_step6;
5144 }
5145
5146 /*
5147 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5148 * a more complex suggestion for fixing these reuse issues in RFC1644
5149 * but not yet ready for general use. Also see RFC1379.
5150 */
5151
5152 #define BSD_TIME_WAIT
5153 #ifdef BSD_TIME_WAIT
5154 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5155 after(skb->seq, sk->acked_seq) && !th->rst)
5156 {
5157 u32 seq = sk->write_seq;
5158 if(sk->debug)
5159 printk("Doing a BSD time wait\n");
5160 tcp_statistics.TcpEstabResets++;
5161 sk->rmem_alloc -= skb->truesize;
5162 skb->sk = NULL;
5163 sk->err=ECONNRESET;
5164 tcp_set_state(sk, TCP_CLOSE);
5165 sk->shutdown = SHUTDOWN_MASK;
5166 release_sock(sk);
5167 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5168 if (sk && sk->state==TCP_LISTEN)
5169 {
5170 sk->inuse=1;
5171 skb->sk = sk;
5172 sk->rmem_alloc += skb->truesize;
5173 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5174 release_sock(sk);
5175 return 0;
5176 }
5177 kfree_skb(skb, FREE_READ);
5178 return 0;
5179 }
5180 #endif
5181 }
5182
5183 /*
5184 * We are now in normal data flow (see the step list in the RFC)
5185 * Note most of these are inline now. I'll inline the lot when
5186 * I have time to test it hard and look at what gcc outputs
5187 */
5188
5189 if (!tcp_sequence(sk, skb->seq, skb->end_seq))
5190 {
5191 bad_tcp_sequence(sk, th, len, opt, saddr, dev);
5192 kfree_skb(skb, FREE_READ);
5193 release_sock(sk);
5194 return 0;
5195 }
5196
5197 if(th->rst)
5198 return tcp_std_reset(sk,skb);
5199
5200 /*
5201 * !syn_ok is effectively the state test in RFC793.
5202 */
5203
5204 if(th->syn && !syn_ok)
5205 {
5206 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5207 return tcp_std_reset(sk,skb);
5208 }
5209
5210
5211 /*
5212 * Delayed ACK time estimator.
5213 */
5214
5215 if (sk->lrcvtime == 0)
5216 {
5217 sk->lrcvtime = jiffies;
5218 sk->ato = HZ/3;
5219 }
5220 else
5221 {
5222 int m;
5223
5224 m = jiffies - sk->lrcvtime;
5225
5226 sk->lrcvtime = jiffies;
5227
5228 if (m <= 0)
5229 m = 1;
5230
5231 if (m > (sk->rtt >> 3))
5232 {
5233 sk->ato = sk->rtt >> 3;
5234 /*
5235 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
5236 */
5237 }
5238 else
5239 {
5240 sk->ato = (sk->ato >> 1) + m;
5241 /*
5242 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
5243 */
5244 }
5245 }
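/*
 * Worked example of the ato estimate above (illustrative numbers):
 * with sk->rtt == 800 (scaled, srtt 100 jiffies) the ceiling
 * rtt >> 3 is 100. Packets arriving 10 jiffies apart then converge
 * via ato = (ato >> 1) + m: from 33 (HZ/3 at HZ == 100) to 26, 23,
 * 21... settling near 2*m == 20, so the delayed ack tracks the
 * sender's actual inter-packet spacing.
 */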
5246
5247 /*
5248 * Process the ACK
5249 */
5250
5251
5252 if(th->ack && !tcp_ack(sk,th,saddr,len))
5253 {
5254 /*
5255 * Our three-way handshake failed.
5256 */
5257
5258 if(sk->state==TCP_SYN_RECV)
5259 {
5260 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5261 }
5262 kfree_skb(skb, FREE_READ);
5263 release_sock(sk);
5264 return 0;
5265 }
5266
5267 rfc_step6: /* I'll clean this up later */
5268
5269 /*
5270 * If the accepted buffer puts us over our queue size we
5271 * now drop it (we must process the ack first to avoid
5272 * deadlock cases).
5273 */
5274
5275 if (sk->rmem_alloc >= sk->rcvbuf)
5276 {
5277 kfree_skb(skb, FREE_READ);
5278 release_sock(sk);
5279 return(0);
5280 }
5281
5282
5283 /*
5284 * Process urgent data
5285 */
5286
5287 if(tcp_urg(sk, th, saddr, len))
5288 {
5289 kfree_skb(skb, FREE_READ);
5290 release_sock(sk);
5291 return 0;
5292 }
5293
5294 /*
5295 * Process the encapsulated data
5296 */
5297
5298 if(tcp_data(skb,sk, saddr, len))
5299 {
5300 kfree_skb(skb, FREE_READ);
5301 release_sock(sk);
5302 return 0;
5303 }
5304
5305 /*
5306 * And done
5307 */
5308
5309 release_sock(sk);
5310 return 0;
5311
5312 no_tcp_socket:
5313 /*
5314 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
5315 */
5316 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
5317 skb->sk = NULL;
5318 /*
5319 * Discard frame
5320 */
5321 kfree_skb(skb, FREE_READ);
5322 return 0;
5323 }
5324
5325 /*
5326 * This routine sends a packet with an out of date sequence
5327 * number. It assumes the other end will try to ack it.
5328 */
5329
5330 static void tcp_write_wakeup(struct sock *sk)
5331 {
5332 struct sk_buff *buff,*skb;
5333 struct tcphdr *t1;
5334 struct device *dev=NULL;
5335 int tmp;
5336
5337 if (sk->zapped)
5338 return; /* After a valid reset we can send no more */
5339
5340 /*
5341 * Write data can still be transmitted/retransmitted in the
5342 * following states. If any other state is encountered, return.
5343 * [listen/close will never occur here anyway]
5344 */
5345
5346 if (sk->state != TCP_ESTABLISHED &&
5347 sk->state != TCP_CLOSE_WAIT &&
5348 sk->state != TCP_FIN_WAIT1 &&
5349 sk->state != TCP_LAST_ACK &&
5350 sk->state != TCP_CLOSING
5351 )
5352 {
5353 return;
5354 }
5355 if ( before(sk->sent_seq, sk->window_seq) &&
5356 (skb=skb_peek(&sk->write_queue)))
5357 {
5358 /*
5359 * We are probing the opening of a window
5360 * but the window size is != 0; it must have been a result
5361 * of SWS avoidance (sender side)
5362 */
5363
5364 struct iphdr *iph;
5365 struct tcphdr *th;
5366 struct tcphdr *nth;
5367 unsigned long win_size;
5368 #if 0
5369 unsigned long ow_size;
5370 #endif
5371 void * tcp_data_start;
5372
5373 /*
5374 * How many bytes can we send ?
5375 */
5376
5377 win_size = sk->window_seq - sk->sent_seq;
5378
5379 /*
5380 * Recover the buffer pointers
5381 */
5382
5383 iph = (struct iphdr *)skb->ip_hdr;
5384 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5385
5386 /*
5387 * Grab the data for a temporary frame
5388 */
5389
5390 buff = sock_wmalloc(sk, win_size + th->doff * 4 +
5391 (iph->ihl << 2) +
5392 sk->prot->max_header + 15,
5393 1, GFP_ATOMIC);
5394 if ( buff == NULL )
5395 return;
5396
5397 /*
5398 * If we strip the packet on the write queue we must
5399 * be ready to retransmit this one
5400 */
5401
5402 buff->free = /*0*/1;
5403
5404 buff->sk = sk;
5405 buff->localroute = sk->localroute;
5406
5407 /*
5408 * Put headers on the new packet
5409 */
5410
5411 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5412 IPPROTO_TCP, sk->opt, buff->truesize,
5413 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5414 if (tmp < 0)
5415 {
5416 sock_wfree(sk, buff);
5417 return;
5418 }
5419
5420 /*
5421 * Move the TCP header over
5422 */
5423
5424 buff->dev = dev;
5425
5426 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5427
5428 memcpy(nth, th, th->doff * 4);
5429
5430 /*
5431 * Correct the new header
5432 */
5433
5434 nth->ack = 1;
5435 nth->ack_seq = htonl(sk->acked_seq);
5436 nth->window = htons(tcp_select_window(sk));
5437 nth->check = 0;
5438
5439 /*
5440 * Find the first data byte.
5441 */
5442
5443 tcp_data_start = (char *) th + (th->doff << 2);
5444
5445 /*
5446 * Add it to our new buffer
5447 */
5448
5449 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5450
5451 /*
5452 * Remember our right edge sequence number.
5453 */
5454
5455 buff->end_seq = sk->sent_seq + win_size;
5456 sk->sent_seq = buff->end_seq; /* Hack */
5457 if(th->urg && ntohs(th->urg_ptr) < win_size)
5458 nth->urg = 0;
5459
5460 /*
5461 * Checksum the split buffer
5462 */
5463
5464 tcp_send_check(nth, sk->saddr, sk->daddr,
5465 nth->doff * 4 + win_size , sk);
5466 }
5467 else
5468 {
5469 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5470 if (buff == NULL)
5471 return;
5472
5473 buff->free = 1;
5474 buff->sk = sk;
5475 buff->localroute = sk->localroute;
5476
5477 /*
5478 * Put in the IP header and routing stuff.
5479 */
5480
5481 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5482 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5483 if (tmp < 0)
5484 {
5485 sock_wfree(sk, buff);
5486 return;
5487 }
5488
5489 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5490 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5491
5492 /*
5493 * Use a previous sequence.
5494 * This should cause the other end to send an ack.
5495 */
5496
5497 t1->seq = htonl(sk->sent_seq-1);
5498 t1->ack = 1;
5499 t1->res1= 0;
5500 t1->res2= 0;
5501 t1->rst = 0;
5502 t1->urg = 0;
5503 t1->psh = 0;
5504 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5505 t1->syn = 0;
5506 t1->ack_seq = htonl(sk->acked_seq);
5507 t1->window = htons(tcp_select_window(sk));
5508 t1->doff = sizeof(*t1)/4;
5509 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5510
5511 }
5512
5513 /*
5514 * Send it.
5515 */
5516
5517 sk->prot->queue_xmit(sk, dev, buff, 1);
5518 tcp_statistics.TcpOutSegs++;
5519 }
5520
5521 /*
5522 * A window probe timeout has occurred.
5523 */
5524
5525 void tcp_send_probe0(struct sock *sk)
5526 {
5527 if (sk->zapped)
5528 return; /* After a valid reset we can send no more */
5529
5530 tcp_write_wakeup(sk);
5531
5532 sk->backoff++;
5533 sk->rto = min(sk->rto << 1, 120*HZ);
5534 sk->retransmits++;
5535 sk->prot->retransmits ++;
5536 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5537 }
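/*
 * Illustrative backoff sequence (assumed starting value, not from the
 * source): with an initial rto of 3*HZ, successive unanswered probes
 * go out after 3, 6, 12, 24... seconds, doubling each time until the
 * 120*HZ ceiling, since tcp_ack() clears sk->backoff only once a
 * probe is answered with usable window space.
 */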
5538
5539 /*
5540 * Socket option code for TCP.
5541 */
5542
5543 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5544 {
5545 int val,err;
5546
5547 if(level!=SOL_TCP)
5548 return ip_setsockopt(sk,level,optname,optval,optlen);
5549
5550 if (optval == NULL)
5551 return(-EINVAL);
5552
5553 err=verify_area(VERIFY_READ, optval, sizeof(int));
5554 if(err)
5555 return err;
5556
5557 val = get_user((int *)optval);
5558
5559 switch(optname)
5560 {
5561 case TCP_MAXSEG:
5562 /*
5563 * Values greater than the interface MTU won't take effect; however,
5564 * at the point when this call is made we typically don't yet know
5565 * which interface is going to be used.
5566 */
5567 if(val<1||val>MAX_WINDOW)
5568 return -EINVAL;
5569 sk->user_mss=val;
5570 return 0;
5571 case TCP_NODELAY:
5572 sk->nonagle=(val==0)?0:1;
5573 return 0;
5574 default:
5575 return(-ENOPROTOOPT);
5576 }
5577 }
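/*
 * Hedged userspace sketch (not part of the kernel source) showing how
 * the two options handled above are set from an application. "fd" is
 * assumed to be a TCP socket; IPPROTO_TCP and SOL_TCP share the value
 * 6 on Linux, so either level reaches this code.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int example_tcp_opts(int fd)
{
	int one = 1;
	int mss = 536;

	/* Disable the Nagle algorithm (sets sk->nonagle above). */
	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
		return -1;
	/* Clamp the MSS we will advertise (sets sk->user_mss above). */
	return setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
}
#endif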
5578
5579 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5580 {
5581 int val,err;
5582
5583 if(level!=SOL_TCP)
5584 return ip_getsockopt(sk,level,optname,optval,optlen);
5585
5586 switch(optname)
5587 {
5588 case TCP_MAXSEG:
5589 val=sk->user_mss;
5590 break;
5591 case TCP_NODELAY:
5592 val=sk->nonagle;
5593 break;
5594 default:
5595 return(-ENOPROTOOPT);
5596 }
5597 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5598 if(err)
5599 return err;
5600 put_user(sizeof(int),(int *) optlen);
5601
5602 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5603 if(err)
5604 return err;
5605 put_user(val,(int *)optval);
5606
5607 return(0);
5608 }
5609
5610
5611 struct proto tcp_prot = {
5612 tcp_close,
5613 ip_build_header,
5614 tcp_connect,
5615 tcp_accept,
5616 ip_queue_xmit,
5617 tcp_retransmit,
5618 tcp_write_wakeup,
5619 tcp_read_wakeup,
5620 tcp_rcv,
5621 tcp_select,
5622 tcp_ioctl,
5623 NULL,
5624 tcp_shutdown,
5625 tcp_setsockopt,
5626 tcp_getsockopt,
5627 tcp_sendmsg,
5628 tcp_recvmsg,
5629 NULL, /* No special bind() */
5630 128,
5631 0,
5632 "TCP",
5633 0, 0,
5634 {NULL,}
5635 };