/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken;
 *					pointers passed were wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code, obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. select()
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_reset() fixed to work for
 *					everything, not just packets for
 *					unknown sockets.
 *		Alan Cox	:	TCP option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [had
 *					the SYN rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid RST frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore-me flag on an RST
 *					receive, otherwise odd bits of prattle
 *					still escape
 *		Alan Cox	:	Fixed another acking-RST-frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					TCP code is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly:	ack < copied bug fix.
 *		Johannes Stille	:	Misc TCP fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non-shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle select() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(); select() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD-compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in selecting before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFCs. For other useful protocol
 *					references see Comer and KA9Q NOS; for
 *					a reference on the difference between
 *					specifications and how BSD works, see
 *					the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work on the
 *					first SYN
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH-
 *					only frames. At least one PC TCP stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	select() matches BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP SYN retries get used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes
 *					may change. Doesn't yet cope with MSS
 *					shrink right, but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, and spot packets with no funny
 *		options arriving in order and process them directly.
 *
 *		Rewrite output state machine to use a single queue and do low window
 *		situations as per the spec (RFC 1122).
 *		Speed up the input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6, so we
 *		could do with it working on IPv4.
 *		User settable/learned rtt/max window/mtu.
 *		Cope with MTU/device switches when retransmitting in tcp.
 *		Fix the window handling to use PR's new code.
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack and
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING. Required because the other end
 *				may not have gotten our last ACK, causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown. There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like. tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 *
 * Use of PSH (4.2.2.2)
 *   MAY aggregate data sent without the PSH flag. (does)
 *   MAY queue data received without the PSH flag. (does)
 *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *   MAY implement PSH on send calls. (doesn't, thus:)
 *     MUST NOT buffer data indefinitely (doesn't [1 second])
 *     MUST set PSH on last segment (does)
 *   MAY pass received PSH to application layer (doesn't)
 *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 * Window Size (4.2.2.3, 4.2.2.16)
 *   MUST treat window size as an unsigned number (does)
 *   SHOULD treat window size as a 32-bit number (does not)
 *   MUST NOT shrink window once it is offered (does not normally)
 *
 * Urgent Pointer (4.2.2.4)
 * **MUST point urgent pointer to last byte of urgent data (not right
 *   after). (doesn't, to be like BSD)
 *   MUST inform application layer asynchronously of incoming urgent
 *   data. (does)
 *   MUST provide application with means of determining the amount of
 *   urgent data pending. (does)
 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *   [Follows BSD 1 byte of urgent data]
 *
 * TCP Options (4.2.2.5)
 *   MUST be able to receive TCP options in any segment. (does)
 *   MUST ignore unsupported options (does)
 *
 * Maximum Segment Size Option (4.2.2.6)
 *   MUST implement both sending and receiving MSS. (does)
 *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *   it always). (does, even when MSS == 536, which is legal)
 *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 *   MUST calculate "effective send MSS" correctly:
 *	min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *   (does - but allows operator override)
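 *
 *	Worked example of that formula (editor's illustration, numbers
 *	assumed rather than taken from the RFC): with a physical MTU of
 *	1500, a remote MSS of 1460 and no IP options,
 *	min(1500, 1460+20) - 20 - 0 = 1460 data bytes per segment; over
 *	a 576-byte MTU path the same formula yields 576 - 20 - 0 = 556.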
 *
 * TCP Checksum (4.2.2.7)
 *   MUST generate and check TCP checksum. (does)
 *
 * Initial Sequence Number Selection (4.2.2.8)
 *   MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *   OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *   necessary for 10Mbps networks - and harder than BSD to spoof!)
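 *   (Editor's arithmetic, for scale: a 1MHz ISN clock walks the 32-bit
 *   sequence space in 2^32 microseconds, roughly 71.6 minutes, versus
 *   about 4.8 hours for the 250KHz clock RFC 793 assumes.)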
 *
 * Simultaneous Open Attempts (4.2.2.10)
 *   MUST support simultaneous open attempts (does)
 *
 * Recovery from Old Duplicate SYN (4.2.2.11)
 *   MUST keep track of active vs. passive open (does)
 *
 * RST segment (4.2.2.12)
 *   SHOULD allow an RST segment to contain data (does, but doesn't do
 *   anything with it, which is standard)
 *
 * Closing a Connection (4.2.2.13)
 *   MUST inform application of whether connection was closed by RST or
 *   normal close. (does)
 *   MAY allow "half-duplex" close (treat connection as closed for the
 *   local app, even before handshake is done). (does)
 *   MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 * Retransmission Timeout (4.2.2.15)
 *   MUST implement Jacobson's slow start and congestion avoidance
 *   stuff. (does)
 *
 * Probing Zero Windows (4.2.2.17)
 *   MUST support probing of zero windows. (does)
 *   MAY keep offered window closed indefinitely. (does)
 *   MUST allow remote window to stay closed indefinitely. (does)
 *
 * Passive Open Calls (4.2.2.18)
 *   MUST NOT let new passive open affect other connections. (doesn't)
 *   MUST support passive opens (LISTENs) concurrently. (does)
 *
 * Time to Live (4.2.2.19)
 *   MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 * Event Processing (4.2.2.20)
 *   SHOULD queue out-of-order segments. (does)
 *   MUST aggregate ACK segments whenever possible. (does but badly)
 *
 * Retransmission Timeout Calculation (4.2.3.1)
 *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *   calculation. (does, or at least explains them in the comments 8*b)
 *   SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 * When to Send an ACK Segment (4.2.3.2)
 *   SHOULD implement delayed ACK. (does not)
 *   MUST keep ACK delay < 0.5 sec. (N/A)
 *
 * When to Send a Window Update (4.2.3.3)
 *   MUST implement receiver-side SWS. (does)
 *
 * When to Send Data (4.2.3.4)
 *   MUST implement sender-side SWS. (does - imperfectly)
 *   SHOULD implement the Nagle algorithm. (does)
 *
 * TCP Connection Failures (4.2.3.5)
 *   MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *   SHOULD inform application layer of soft errors. (doesn't)
 *
 * TCP Keep-Alives (4.2.3.6)
 *   MAY provide keep-alives. (does)
 *   MUST make keep-alives configurable on a per-connection basis. (does)
 *   MUST default to no keep-alives. (does)
 * **MUST make keep-alive interval configurable. (doesn't)
 * **MUST make default keep-alive interval > 2 hours. (doesn't)
 *   MUST NOT interpret failure to ACK keep-alive packet as dead
 *   connection. (doesn't)
 *   SHOULD send keep-alive with no data. (does)
 *
 * TCP Multihoming (4.2.3.7)
 *   MUST get source address from IP layer before sending first
 *   SYN. (does)
 *   MUST use same local address for all segments of a connection. (does)
 *
 * IP Options (4.2.3.8)
 *   (I don't think the IP layer sees the IP options, yet.)
 *   MUST ignore unsupported IP options. (does, I guess 8*b)
 *   MAY support Time Stamp and Record Route. (doesn't)
 * **MUST allow application to specify a source route. (doesn't?)
 * **MUST allow received Source Route option to set route for all future
 *   segments on this connection. (doesn't, not that I think it's a
 *   huge problem)
 *
 * ICMP messages (4.2.3.9)
 *   MUST act on ICMP errors. (does)
 *   MUST slow transmission upon receipt of a Source Quench. (does)
 *   MUST NOT abort connection upon receipt of soft Destination
 *   Unreachables (0, 1, 5), Time Exceededs and Parameter
 *   Problems. (doesn't)
 *   SHOULD report soft Destination Unreachables etc. to the
 *   application. (doesn't)
 *   SHOULD abort connection upon receipt of hard Destination Unreachable
 *   messages (2, 3, 4). (does)
 *
 * Remote Address Validation (4.2.3.10)
 *   MUST reject as an error OPEN for invalid remote IP address. (does)
 *   MUST ignore SYN with invalid source address. (does)
 *   MUST silently discard incoming SYN for broadcast/multicast
 *   address. (does)
 *
 * Asynchronous Reports (4.2.4.1)
 * **MUST provide mechanism for reporting soft errors to application
 *   layer. (doesn't)
 *
 * Type of Service (4.2.4.2)
 *   MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/termios.h>
#include <linux/in.h>
#include <linux/fcntl.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <net/checksum.h>

/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib	tcp_statistics;

/*
 *	Cached last hit socket
 */

volatile unsigned long	th_cache_saddr, th_cache_daddr;
volatile unsigned short	th_cache_dport, th_cache_sport;
volatile struct sock	*th_cache_sk;

void tcp_cache_zap(void)
{
	unsigned long flags;
	save_flags(flags);
	cli();
	th_cache_saddr = 0;
	th_cache_daddr = 0;
	th_cache_dport = 0;
	th_cache_sport = 0;
	th_cache_sk = NULL;
	restore_flags(flags);
}

static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;

static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[] = {
	"Unused", "Established", "Syn Sent", "Syn Recv",
	"Fin Wait 1", "Fin Wait 2", "Time Wait", "Close",
	"Close Wait", "Last ACK", "Listen", "Closing"
};
#endif

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if (sk->state == TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if (sk->debug)
		printk("TCP sk=%p, State %s -> %s\n", sk, statename[sk->state], statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if (state == TCP_ESTABLISHED && sk->state == TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state = state;
	if (state == TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
	if (sk->state == TCP_CLOSE)
		tcp_cache_zap();
}

/*
 *	This routine picks a TCP window for a socket based on
 *	the following constraints:
 *
 *	1. The window can never be shrunk once it is offered (RFC 793)
 *	2. We limit memory per socket
 *
 *	For now we use NET2E3's heuristic of offering half the memory
 *	we have handy. All is not as bad as this seems however, because
 *	of two things. Firstly we will bin packets even within the window
 *	in order to get the data we are waiting for into the memory limit.
 *	Secondly we bin common duplicate forms at receive time.
 *	Better heuristics welcome.
 */

int tcp_select_window(struct sock *sk)
{
	int new_window = sock_rspace(sk);

	if (sk->window_clamp)
		new_window = min(sk->window_clamp, new_window);
	/*
	 *	Two things are going on here. First, we don't ever offer a
	 *	window less than min(sk->mss, MAX_WINDOW/2). This is the
	 *	receiver side of SWS as specified in RFC1122.
	 *	Second, we always give them at least the window they
	 *	had before, in order to avoid retracting window. This
	 *	is technically allowed, but RFC1122 advises against it and
	 *	in practice it causes trouble.
	 *
	 *	Fixme: This doesn't correctly handle the case where
	 *	new_window > sk->window but not by enough to allow for the
	 *	shift in sequence space.
	 */
	if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
		return(sk->window);
	return(new_window);
}
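
/*
 *	Worked example of the test above (editor's illustration, all
 *	numbers assumed): with sock_rspace() returning 12000, no clamp,
 *	mss = 1460 and a previously offered window of 4096, the floor is
 *	min(1460, MAX_WINDOW/2) = 1460; 12000 is above both the floor and
 *	the old window, so 12000 is offered. Had sock_rspace() returned
 *	2000 instead, 2000 < 4096 would keep the old 4096 on offer so the
 *	window never appears to shrink.
 */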

/*
 *	Find someone to 'accept'. Must be called with
 *	sk->inuse=1 or cli()
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p = skb_peek(&s->receive_queue);
	if (p == NULL)
		return NULL;
	do
	{
		if (p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p = p->next;
	}
	while (p != (struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb = tcp_find_established(s);
	if (skb != NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead = 1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 *	Enter the time wait state.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk, TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}

/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct = 0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb, ((unsigned char *)skb->ip_hdr) - skb->data);

		/*
		 *	In general it's OK just to use the old packet. However we
		 *	need to use the current ack and window fields. Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data. That shouldn't be a problem,
		 *	if the other end is doing the right thing. Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl << 2);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		{
			/* ANK: Ugly, but the bug that was here should now be fixed. */
			struct options * opt = (struct options *)skb->proto_priv;
			rt = ip_check_route(&sk->ip_route_cache, opt->srr ? opt->faddr : iph->daddr, skb->localroute);
		}

		iph->id = htons(ip_id_count++);
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
		if (rt && ntohs(iph->tot_len) > rt->rt_mtu)
			iph->frag_off &= ~htons(IP_DF);
#endif
		ip_send_check(iph);

		if (rt == NULL)	/* Deep poo */
		{
			if (skb->sk)
			{
				skb->sk->err = ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev = rt->rt_dev;
			skb->raddr = rt->rt_gateway;
			skb->dev = dev;
			skb->arp = 1;
			if (rt->rt_hh)
			{
				memcpy(skb_push(skb, dev->hard_header_len), rt->rt_hh->hh_data, dev->hard_header_len);
				if (!rt->rt_hh->hh_uptodate)
				{
					skb->arp = 0;
#if RT_CACHE_DEBUG >= 2
					printk("tcp_do_retransmit: hh miss %08x via %08x\n", iph->daddr, rt->rt_gateway);
#endif
				}
			}
			else if (dev->hard_header)
			{
				if (dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len) < 0)
					skb->arp = 0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up-to-date window and ack report with this
			 *	retransmit, to keep happy the odd buggy tcp that relies
			 *	on BSD behaving this way.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = htonl(sk->acked_seq);
			th->window = htons(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	of an as yet unsent frame!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}
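
/*
 *	The comment above mentions RFC1141/RFC1624 incremental checksum
 *	updates as a possible speedup. A minimal sketch of the RFC1624
 *	arithmetic (editor's illustration only, not wired in anywhere):
 *	given the old checksum and the old and new values of one 16-bit
 *	field, the new checksum is ~(~old_sum + ~old_val + new_val),
 *	with the carries folded back into 16 bits.
 */
#if 0
static __inline__ unsigned short csum_update16(unsigned short old_sum,
	unsigned short old_val, unsigned short new_val)
{
	unsigned long sum = (~old_sum & 0xffff) + (~old_val & 0xffff) + new_val;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry ... */
	sum = (sum & 0xffff) + (sum >> 16);	/* ... possibly twice */
	return ~sum & 0xffff;
}
#endif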

/*
 *	Reset the retransmission timer
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if ((int)when < 0)
	{
		when = 3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires = jiffies + when;
	add_timer(&sk->retransmit_timer);
}

/*
 *	This is the normal code called for timeouts. It does the retransmission
 *	and then does backoff. tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 *	Increase the timeout each time we retransmit. Note that
	 *	we do not increase the rtt estimate. rto is initialized
	 *	from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 *	that doubling rto each time is the least we can get away with.
	 *	In KA9Q, Karn uses this for the first few times, and then
	 *	goes to quadratic. netBSD doubles, but only goes up to *64,
	 *	and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 *	defined in the protocol as the maximum possible RTT. I guess
	 *	we'll have to use something other than TCP to talk to the
	 *	University of Mars.
	 *
	 *	PAWS allows us longer timeouts and large windows, so once
	 *	implemented ftp to mars will work nicely. We will have to fix
	 *	the 120 second clamps though!
	 */
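
	/*
	 *	For concreteness (editor's illustration, assuming an initial
	 *	rto of 3 seconds): successive backoffs give 3s, 6s, 12s, 24s,
	 *	48s, 96s and then 120s, where the min(..., 120*HZ) clamp below
	 *	holds it for every retransmission after that.
	 */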

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}


/*
 *	A timer event has triggered a TCP retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1;	/* remember window where we lost */
	/* sk->ssthresh in theory can be zero. I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}

/*
 *	A write timeout has occurred. Process the after effects.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		ip_rt_advice(&sk->ip_route_cache, 0);
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if (sk->retransmits > TCP_SYN_RETRIES && sk->state == TCP_SYN_SENT)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk, TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING)
		{
			tcp_set_state(sk, TCP_TIME_WAIT);
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}

/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	Only process if the socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies + HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup(sk);
		if (!sk->dead)
			sk->data_ready(sk, 0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack,
			 * so we need to check for that.
			 */
			{
				struct sk_buff *skb;
				unsigned long flags;

				save_flags(flags);
				cli();
				skb = sk->send_head;
				if (!skb)
				{
					restore_flags(flags);
				}
				else
				{
					/*
					 *	Kicked by a delayed ack. Reset the timer
					 *	correctly now.
					 */
					if (jiffies < skb->when + sk->rto)
					{
						reset_xmit_timer(sk, TIME_WRITE, skb->when + sk->rto - jiffies);
						restore_flags(flags);
						break;
					}
					restore_flags(flags);
					/*
					 *	Retransmission
					 */
					sk->retransmits++;
					sk->prot->retransmits++;
					sk->prot->retransmit(sk, 0);
					tcp_write_timeout(sk);
				}
				break;
			}
		/* Sending keepalives */
		case TIME_KEEPOPEN:
			/*
			 *	This reset_timer() call is a hack, this is not
			 *	how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup(sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 *	This one is _WRONG_. FIXME urgently.
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph = (struct iphdr *)(header - sizeof(struct iphdr));
#endif
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 *	FIXME:
		 *	For now we will just trigger a linear backoff.
		 *	The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err = EPROTO;
		sk->error_report(sk);
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 *	Ugly trick to pass MTU to protocol layer.
		 *	Really we should add an argument "info" to the error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk, TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}
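
/*
 *	Worked example of the MTU clamp above (editor's illustration): if
 *	the ICMP "fragmentation needed" report smuggles a next-hop MTU of
 *	576 in via iph->id, the cached route's rt_mtu drops to 576 and
 *	sk->mtu is clamped to 576 - 20 (iphdr) - 20 (tcphdr) = 536 bytes
 *	of TCP payload per segment.
 */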


/*
 *	Walk down the receive queue counting readable data until we hit the
 *	end or find a gap in the received data queue (i.e. a frame missing
 *	that needs sending to us). Not sorting the data into two queues as
 *	it arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if (sk && sk->debug)
		printk("tcp_readable: %p - ", sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if (sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))	/* Found a hole so stop here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 *	Don't count urg data ... but do it in the right place!
		 *	Consider: "old_data (ptr is here) URG PUSH data"
		 *	The old code would stop at the first push because
		 *	it counted the urg (amount==1) and then does amount--
		 *	*after* the loop. This means tcp_readable() always
		 *	returned zero if any URG PUSH was in the queue, even
		 *	though there was normal data available. If we subtract
		 *	the urg data right here, we even get it to work for more
		 *	than one URG PUSH skb without normal data.
		 *	This means that select() finally works now with urg data
		 *	in the queue. Note that rlogin was never affected
		 *	because it doesn't use select(); it uses two processes
		 *	and a blocking read(). And the queue scan in tcp_read()
		 *	was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh)
			break;
		skb = skb->next;
	}
	while (skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if (sk->debug)
		printk("got %lu bytes.\n", amount);
	return(amount);
}

/*
 *	LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup, wait);
		return retval;
	}
	return 0;
}


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch (sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq + 1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 *	This is now right thanks to a small fix
		 *	by Matt Dillon.
		 */

		if (sock_wspace(sk) < sk->mtu + 128 + sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}
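
/*
 *	Example of the SEL_IN urgent-data test above (editor's
 *	illustration, numbers assumed): with copied_seq = 1000,
 *	acked_seq = 1001, urg_seq = 1000, urg_data set and urginline
 *	off, the only unread byte is the out-of-band one, so SEL_IN
 *	does not fire and the reader blocks, while SEL_EX still
 *	reports the exceptional condition.
 */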

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch (cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int));
			if (err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int));
			if (err)
				return err;
			put_user(answ, (int *)arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);
			amount = sock_wspace(sk);
			err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int));
			if (err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
}
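
/*
 *	For reference (editor's note): csum_tcpudp_magic() folds the
 *	usual TCP pseudo-header - source address, destination address,
 *	protocol number and TCP length - into the partial sum handed in
 *	as 'base', returning the final 16-bit ones-complement checksum
 *	ready to be written into th->check.
 */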



void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th, len, 0));
	return;
}

/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked that it seems sane.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	Length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *)th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header-sized packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size.. */
		if (!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb, FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->seq = ntohl(th->seq);
	skb->end_seq = skb->seq + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->end_seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	    sk->packets_out >= sk->cong_window)
	{
		/* Checksum will be supplied by tcp_write_xmit, so we
		 * shouldn't need to set it at all. I'm being paranoid. */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (this is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = htonl(sk->acked_seq);
		th->window = htons(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time, which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}
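
/*
 *	Worked example of the queueing tests above (editor's
 *	illustration, all numbers assumed): with window_seq = 5000, an
 *	skb whose end_seq is 5300 fails the after() test and is queued
 *	for later; and with cong_window = 2 and packets_out = 2 the
 *	in-flight limit queues the frame even when it fits the window.
 */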

/*
 *	Locking problems lead us to a messy situation where we can have
 *	multiple partially complete buffers queued up. This is really bad
 *	as we don't want to be sending partial buffers. Fix this with
 *	a semaphore or similar to lock tcp_write per socket.
 *
 *	These routines are pretty self-descriptive.
 */

struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 *	Empty the partial queue
 */

static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 *	Queue a partial frame
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = jiffies + HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}


/*
 *	This routine sends an ack and also updates the window.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
			 struct sock *sk,
			 struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if (sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_ACK_SIZE, sk->ip_tos, sk->ip_ttl, &sk->ip_route_cache);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 = (struct tcphdr *)skb_put(buff, sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = htonl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = htons(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = htonl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}


/*
 *	This routine builds a generic TCP header.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th, (void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh = (push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
1688 int len, int nonblock, int flags)
1689 {
1690 int copied = 0;
1691 int copy;
1692 int tmp;
1693 int seglen;
1694 int iovct=0;
1695 struct sk_buff *skb;
1696 struct sk_buff *send_tmp;
1697 struct proto *prot;
1698 struct device *dev = NULL;
1699 unsigned char *from;
1700
1701 /*
1702 * Do sanity checking for sendmsg/sendto/send
1703 */
1704
1705 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1706 return -EINVAL;
1707 if (msg->msg_name)
1708 {
1709 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1710 if(sk->state == TCP_CLOSE)
1711 return -ENOTCONN;
1712 if (msg->msg_namelen < sizeof(*addr))
1713 return -EINVAL;
1714 if (addr->sin_family && addr->sin_family != AF_INET)
1715 return -EINVAL;
1716 if (addr->sin_port != sk->dummy_th.dest)
1717 return -EISCONN;
1718 if (addr->sin_addr.s_addr != sk->daddr)
1719 return -EISCONN;
1720 }
1721
1722 /*
1723 * Ok commence sending
1724 */
1725
1726 while(iovct<msg->msg_iovlen)
1727 {
1728 seglen=msg->msg_iov[iovct].iov_len;
1729 from=msg->msg_iov[iovct++].iov_base;
1730 sk->inuse=1;
1731 prot = sk->prot;
1732 while(seglen > 0)
1733 {
1734 if (sk->err)
1735 { /* Stop on an error */
1736 release_sock(sk);
1737 if (copied)
1738 return(copied);
1739 return sock_error(sk);
1740 }
1741
1742 /*
1743 * First thing we do is make sure that we are established.
1744 */
1745
1746 if (sk->shutdown & SEND_SHUTDOWN)
1747 {
1748 release_sock(sk);
1749 sk->err = EPIPE;
1750 if (copied)
1751 return(copied);
1752 sk->err = 0;
1753 return(-EPIPE);
1754 }
1755
1756 /*
1757 * Wait for a connection to finish.
1758 */
1759
1760 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1761 {
1762 if (sk->err)
1763 {
1764 release_sock(sk);
1765 if (copied)
1766 return(copied);
1767 return sock_error(sk);
1768 }
1769
1770 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1771 {
1772 release_sock(sk);
1773 if (copied)
1774 return(copied);
1775
1776 if (sk->err)
1777 return sock_error(sk);
1778
1779 if (sk->keepopen)
1780 {
1781 send_sig(SIGPIPE, current, 0);
1782 }
1783 return(-EPIPE);
1784 }
1785
1786 if (nonblock || copied)
1787 {
1788 release_sock(sk);
1789 if (copied)
1790 return(copied);
1791 return(-EAGAIN);
1792 }
1793
1794 release_sock(sk);
1795 cli();
1796
1797 if (sk->state != TCP_ESTABLISHED &&
1798 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1799 {
1800 interruptible_sleep_on(sk->sleep);
1801 if (current->signal & ~current->blocked)
1802 {
1803 sti();
1804 if (copied)
1805 return(copied);
1806 return(-ERESTARTSYS);
1807 }
1808 }
1809 sk->inuse = 1;
1810 sti();
1811 }
1812
1813 /*
 1814 	 * The following code can result in copy <= 0 if sk->mss is ever
1815 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1816 * sk->mtu is constant once SYN processing is finished. I.e. we
1817 * had better not get here until we've seen his SYN and at least one
1818 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1819 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1820 * non-decreasing. Note that any ioctl to set user_mss must be done
1821 * before the exchange of SYN's. If the initial ack from the other
1822 * end has a window of 0, max_window and thus mss will both be 0.
1823 */
1824
1825 /*
1826 * Now we need to check if we have a half built packet.
1827 */
1828 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1829 /*
 1830 	 * FIXME: I'm almost sure that this fragment is a BUG,
1831 * but it works... I do not know why 8) --ANK
1832 *
1833 * Really, we should rebuild all the queues...
 1834 	 * It's difficult. A temporary hack is to send all
1835 * queued segments with allowed fragmentation.
1836 */
1837 {
1838 int new_mss = min(sk->mtu, sk->max_window);
1839 if (new_mss < sk->mss)
1840 {
1841 tcp_send_partial(sk);
1842 sk->mss = new_mss;
1843 }
1844 }
1845 #endif
1846
1847 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1848 {
1849 int hdrlen;
1850
1851 /* IP header + TCP header */
1852 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1853 + sizeof(struct tcphdr);
1854
1855 /* Add more stuff to the end of skb->len */
1856 if (!(flags & MSG_OOB))
1857 {
1858 copy = min(sk->mss - (skb->len - hdrlen), seglen);
1859 if (copy <= 0)
1860 {
 1861 	printk("TCP: **bug**: \"copy\" <= 0\n");
 	release_sock(sk);	/* don't return with the socket still locked */
 1862 	return -EFAULT;
1863 }
1864 memcpy_fromfs(skb_put(skb,copy), from, copy);
1865 from += copy;
1866 copied += copy;
1867 len -= copy;
1868 sk->write_seq += copy;
1869 seglen -= copy;
1870 }
1871 if ((skb->len - hdrlen) >= sk->mss ||
1872 (flags & MSG_OOB) || !sk->packets_out)
1873 tcp_send_skb(sk, skb);
1874 else
1875 tcp_enqueue_partial(skb, sk);
1876 continue;
1877 }
1878
1879 /*
1880 * We also need to worry about the window.
1881 * If window < 1/2 the maximum window we've seen from this
1882 * host, don't use it. This is sender side
1883 * silly window prevention, as specified in RFC1122.
 1884 	 * (Note that this is different from earlier versions of
 1885 	 * SWS prevention, e.g. RFC 813.) What we actually do is
 1886 	 * use the whole MSS. Since this results in the right
 1887 	 * edge of the packet being outside the window, it will
 1888 	 * be queued for later rather than sent.
1889 */
1890
1891 copy = sk->window_seq - sk->write_seq;
1892 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1893 copy = sk->mss;
1894 if (copy > seglen)
1895 copy = seglen;
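 	/*
 	 * Worked example (not from the original source): with max_window =
 	 * 8192 and mss = 1460, a usable window of only 700 bytes fails the
 	 * "at least half the max" test above, so copy is forced up to the
 	 * full 1460 byte MSS. The right edge of that segment then lies 760
 	 * bytes past the offered window, so tcp_write_xmit() holds it on
 	 * the write queue instead of dribbling out a 700 byte segment.
 	 */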
1896
1897 /*
1898 * We should really check the window here also.
1899 */
1900
1901 send_tmp = NULL;
1902 if (copy < sk->mss && !(flags & MSG_OOB))
1903 {
1904 /*
1905 * We will release the socket in case we sleep here.
1906 */
1907 release_sock(sk);
1908 /*
1909 * NB: following must be mtu, because mss can be increased.
1910 * mss is always <= mtu
1911 */
1912 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1913 sk->inuse = 1;
1914 send_tmp = skb;
1915 }
1916 else
1917 {
1918 /*
1919 * We will release the socket in case we sleep here.
1920 */
1921 release_sock(sk);
1922 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1923 sk->inuse = 1;
1924 }
1925
1926 /*
1927 * If we didn't get any memory, we need to sleep.
1928 */
1929
1930 if (skb == NULL)
1931 {
1932 sk->socket->flags |= SO_NOSPACE;
1933 if (nonblock)
1934 {
1935 release_sock(sk);
1936 if (copied)
1937 return(copied);
1938 return(-EAGAIN);
1939 }
1940
1941 /*
1942 * FIXME: here is another race condition.
1943 */
1944
1945 tmp = sk->wmem_alloc;
1946 release_sock(sk);
1947 cli();
1948 /*
1949 * Again we will try to avoid it.
1950 */
1951 if (tmp <= sk->wmem_alloc &&
1952 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1953 && sk->err == 0)
1954 {
1955 sk->socket->flags &= ~SO_NOSPACE;
1956 interruptible_sleep_on(sk->sleep);
1957 if (current->signal & ~current->blocked)
1958 {
1959 sti();
1960 if (copied)
1961 return(copied);
1962 return(-ERESTARTSYS);
1963 }
1964 }
1965 sk->inuse = 1;
1966 sti();
1967 continue;
1968 }
1969
1970 skb->sk = sk;
1971 skb->free = 0;
1972 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1973
1974 /*
1975 * FIXME: we need to optimize this.
1976 * Perhaps some hints here would be good.
1977 */
1978
1979 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1980 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
1981 if (tmp < 0 )
1982 {
1983 sock_wfree(sk, skb);
1984 release_sock(sk);
1985 if (copied)
1986 return(copied);
1987 return(tmp);
1988 }
1989 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
1990 skb->ip_hdr->frag_off |= htons(IP_DF);
1991 #endif
1992 skb->dev = dev;
1993 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1994 tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
1995 if (tmp < 0)
1996 {
1997 sock_wfree(sk, skb);
1998 release_sock(sk);
1999 if (copied)
2000 return(copied);
2001 return(tmp);
2002 }
2003
2004 if (flags & MSG_OOB)
2005 {
2006 skb->h.th->urg = 1;
 2007 	skb->h.th->urg_ptr = htons(copy);
2008 }
2009
2010 memcpy_fromfs(skb_put(skb,copy), from, copy);
2011
2012 from += copy;
2013 copied += copy;
2014 len -= copy;
2015 seglen -= copy;
2016 skb->free = 0;
2017 sk->write_seq += copy;
2018
2019 if (send_tmp != NULL && sk->packets_out)
2020 {
2021 tcp_enqueue_partial(send_tmp, sk);
2022 continue;
2023 }
2024 tcp_send_skb(sk, skb);
2025 }
2026 }
2027 sk->err = 0;
2028
2029 /*
2030 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
2031 * interactive fast network servers. It's meant to be on and
2032 * it really improves the throughput though not the echo time
2033 * on my slow slip link - Alan
2034 */
2035
2036 /*
2037 * Avoid possible race on send_tmp - c/o Johannes Stille
2038 */
2039
2040 if(sk->partial && ((!sk->packets_out)
2041 /* If not nagling we can send on the before case too.. */
2042 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
2043 ))
2044 tcp_send_partial(sk);
2045
2046 release_sock(sk);
2047 return(copied);
2048 }
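 
 /*
 *	Illustrative sketch (not part of the original file): the Nagle test
 *	applied at the end of tcp_sendmsg() above, written as a standalone
 *	predicate. It uses this file's before() sequence comparison; the
 *	helper name is hypothetical.
 */
 
 static __inline__ int tcp_nagle_flush_ok(struct sock *sk)
 {
 	if (sk->partial == NULL)
 		return 0;	/* nothing half-built to flush */
 	if (!sk->packets_out)
 		return 1;	/* idle connection: send at once */
 	if (sk->nonagle && before(sk->write_seq, sk->window_seq))
 		return 1;	/* TCP_NODELAY and the data fits the window */
 	return 0;		/* Nagle: hold until outstanding data is acked */
 }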
2049
2050 /*
2051 * Send an ack if one is backlogged at this point. Ought to merge
2052 * this with tcp_send_ack().
2053 */
2054
2055 static void tcp_read_wakeup(struct sock *sk)
2056 {
2057 int tmp;
2058 struct device *dev = NULL;
2059 struct tcphdr *t1;
2060 struct sk_buff *buff;
2061
2062 if (!sk->ack_backlog)
2063 return;
2064
2065 /*
2066 * If we're closed, don't send an ack, or we'll get a RST
2067 * from the closed destination.
2068 */
2069 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2070 return;
2071
2072 /*
2073 * FIXME: we need to put code here to prevent this routine from
2074 * being called. Being called once in a while is ok, so only check
2075 * if this is the second time in a row.
2076 */
2077
2078 /*
2079 * We need to grab some memory, and put together an ack,
2080 * and then put it into the queue to be sent.
2081 */
2082
2083 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2084 if (buff == NULL)
2085 {
2086 /* Try again real soon. */
2087 reset_xmit_timer(sk, TIME_WRITE, HZ);
2088 return;
2089 }
2090
2091 buff->sk = sk;
2092 buff->localroute = sk->localroute;
2093
2094 /*
2095 * Put in the IP header and routing stuff.
2096 */
2097
2098 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2099 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2100 if (tmp < 0)
2101 {
2102 buff->free = 1;
2103 sock_wfree(sk, buff);
2104 return;
2105 }
2106
2107 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2108
2109 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2110 t1->seq = htonl(sk->sent_seq);
2111 t1->ack = 1;
2112 t1->res1 = 0;
2113 t1->res2 = 0;
2114 t1->rst = 0;
2115 t1->urg = 0;
2116 t1->syn = 0;
2117 t1->psh = 0;
2118 sk->ack_backlog = 0;
2119 sk->bytes_rcv = 0;
2120 sk->window = tcp_select_window(sk);
2121 t1->window = htons(sk->window);
2122 t1->ack_seq = htonl(sk->acked_seq);
2123 t1->doff = sizeof(*t1)/4;
2124 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2125 sk->prot->queue_xmit(sk, dev, buff, 1);
2126 tcp_statistics.TcpOutSegs++;
2127 }
2128
2129
2130 /*
2131 * FIXME:
2132 * This routine frees used buffers.
2133 * It should consider sending an ACK to let the
2134 * other end know we now have a bigger window.
2135 */
2136
2137 static void cleanup_rbuf(struct sock *sk)
2138 {
2139 unsigned long flags;
2140 unsigned long left;
2141 struct sk_buff *skb;
2142 unsigned long rspace;
2143
2144 if(sk->debug)
2145 printk("cleaning rbuf for sk=%p\n", sk);
2146
2147 save_flags(flags);
2148 cli();
2149
2150 left = sock_rspace(sk);
2151
2152 /*
2153 * We have to loop through all the buffer headers,
2154 * and try to free up all the space we can.
2155 */
2156
2157 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2158 {
2159 if (!skb->used || skb->users)
2160 break;
2161 skb_unlink(skb);
2162 skb->sk = sk;
2163 kfree_skb(skb, FREE_READ);
2164 }
2165
2166 restore_flags(flags);
2167
2168 /*
2169 * FIXME:
2170 * At this point we should send an ack if the difference
2171 * in the window, and the amount of space is bigger than
2172 * TCP_WINDOW_DIFF.
2173 */
2174
2175 if(sk->debug)
2176 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2177 left);
2178 if ((rspace=sock_rspace(sk)) != left)
2179 {
2180 /*
2181 * This area has caused the most trouble. The current strategy
2182 * is to simply do nothing if the other end has room to send at
2183 * least 3 full packets, because the ack from those will auto-
2184 * matically update the window. If the other end doesn't think
2185 * we have much space left, but we have room for at least 1 more
2186 * complete packet than it thinks we do, we will send an ack
2187 * immediately. Otherwise we will wait up to .5 seconds in case
2188 * the user reads some more.
2189 */
2190 sk->ack_backlog++;
2191 /*
2192 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2193 * if the other end is offering a window smaller than the agreed on MSS
2194 * (called sk->mtu here). In theory there's no connection between send
2195 * and receive, and so no reason to think that they're going to send
2196 * small packets. For the moment I'm using the hack of reducing the mss
2197 * only on the send side, so I'm putting mtu here.
2198 */
2199
2200 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2201 {
2202 /* Send an ack right now. */
2203 tcp_read_wakeup(sk);
2204 }
2205 else
2206 {
2207 /* Force it to send an ack soon. */
2208 int was_active = del_timer(&sk->retransmit_timer);
2209 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2210 {
2211 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2212 }
2213 else
2214 add_timer(&sk->retransmit_timer);
2215 }
2216 }
2217 }
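 
 /*
 *	Illustrative sketch (not part of the original file): the decision
 *	cleanup_rbuf() makes above, as a pure function. Returns nonzero when
 *	the newly freed receive space exceeds what the peer still thinks it
 *	may send by more than one full packet, i.e. when an immediate window
 *	update ack is worth the bandwidth. The helper name is hypothetical.
 */
 
 static __inline__ int tcp_window_update_wanted(unsigned long rspace,
 	unsigned long window, unsigned long bytes_rcv, unsigned long mtu)
 {
 	/* Space the peer believes remains is (window - bytes_rcv);   */
 	/* ack at once only if we can offer at least one more packet. */
 	return rspace > (window - bytes_rcv + mtu);
 }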
2218
2219
2220 /*
2221 * Handle reading urgent data. BSD has very simple semantics for
2222 * this, no blocking and very strange errors 8)
2223 */
2224
2225 static int tcp_recv_urg(struct sock * sk, int nonblock,
2226 struct msghdr *msg, int len, int flags, int *addr_len)
2227 {
2228 /*
2229 * No URG data to read
2230 */
2231 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2232 return -EINVAL; /* Yes this is right ! */
2233
2234 if (sk->err)
2235 return sock_error(sk);
2236
2237 if (sk->state == TCP_CLOSE || sk->done)
2238 {
2239 if (!sk->done)
2240 {
2241 sk->done = 1;
2242 return 0;
2243 }
2244 return -ENOTCONN;
2245 }
2246
2247 if (sk->shutdown & RCV_SHUTDOWN)
2248 {
2249 sk->done = 1;
2250 return 0;
2251 }
2252 sk->inuse = 1;
2253 if (sk->urg_data & URG_VALID)
2254 {
2255 char c = sk->urg_data;
2256 if (!(flags & MSG_PEEK))
2257 sk->urg_data = URG_READ;
2258 memcpy_toiovec(msg->msg_iov, &c, 1);
2259 if(msg->msg_name)
2260 {
2261 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2262 sin->sin_family=AF_INET;
2263 sin->sin_addr.s_addr=sk->daddr;
2264 sin->sin_port=sk->dummy_th.dest;
2265 }
2266 if(addr_len)
2267 *addr_len=sizeof(struct sockaddr_in);
2268 release_sock(sk);
2269 return 1;
2270 }
2271 release_sock(sk);
2272
2273 /*
2274 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2275 * the available implementations agree in this case:
2276 * this call should never block, independent of the
2277 * blocking state of the socket.
2278 * Mike <pall@rz.uni-karlsruhe.de>
2279 */
2280 return -EAGAIN;
2281 }
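 
 /*
 *	Illustrative sketch (not part of the original file): how sk->urg_data
 *	packs its state, as read by tcp_recv_urg() above. The low eight bits
 *	hold the urgent octet itself; URG_VALID and URG_READ are flag bits
 *	above it, so one int records both the byte and whether it is still
 *	unread. The helper name is hypothetical.
 */
 
 static __inline__ int tcp_peek_urg_byte(int urg_data, unsigned char *out)
 {
 	if (!(urg_data & URG_VALID))
 		return 0;			/* none pending, or already read */
 	*out = (unsigned char) urg_data;	/* the single out-of-band byte */
 	return 1;
 }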
2282
2283
2284 /*
2285 * This routine copies from a sock struct into the user buffer.
2286 */
2287
2288 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
2289 int len, int nonblock, int flags, int *addr_len)
2290 {
2291 struct wait_queue wait = { current, NULL };
2292 int copied = 0;
2293 u32 peek_seq;
2294 volatile u32 *seq; /* So gcc doesn't overoptimise */
2295 unsigned long used;
2296
2297 /*
2298 * This error should be checked.
2299 */
2300
2301 if (sk->state == TCP_LISTEN)
2302 return -ENOTCONN;
2303
2304 /*
2305 * Urgent data needs to be handled specially.
2306 */
2307
2308 if (flags & MSG_OOB)
2309 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2310
2311 /*
2312 * Copying sequence to update. This is volatile to handle
2313 * the multi-reader case neatly (memcpy_to/fromfs might be
2314 * inline and thus not flush cached variables otherwise).
2315 */
2316
2317 peek_seq = sk->copied_seq;
2318 seq = &sk->copied_seq;
2319 if (flags & MSG_PEEK)
2320 seq = &peek_seq;
2321
2322 add_wait_queue(sk->sleep, &wait);
2323 sk->inuse = 1;
2324 while (len > 0)
2325 {
2326 struct sk_buff * skb;
2327 u32 offset;
2328
2329 /*
2330 * Are we at urgent data? Stop if we have read anything.
2331 */
2332
2333 if (copied && sk->urg_data && sk->urg_seq == *seq)
2334 break;
2335
2336 /*
2337 * Next get a buffer.
2338 */
2339
2340 current->state = TASK_INTERRUPTIBLE;
2341
2342 skb = skb_peek(&sk->receive_queue);
2343 do
2344 {
2345 if (!skb)
2346 break;
2347 if (before(*seq, skb->seq))
2348 break;
2349 offset = *seq - skb->seq;
2350 if (skb->h.th->syn)
2351 offset--;
2352 if (offset < skb->len)
2353 goto found_ok_skb;
2354 if (skb->h.th->fin)
2355 goto found_fin_ok;
2356 if (!(flags & MSG_PEEK))
2357 skb->used = 1;
2358 skb = skb->next;
2359 }
2360 while (skb != (struct sk_buff *)&sk->receive_queue);
2361
2362 if (copied)
2363 break;
2364
2365 if (sk->err)
2366 {
2367 copied = sock_error(sk);
2368 break;
2369 }
2370
2371 if (sk->state == TCP_CLOSE)
2372 {
2373 if (!sk->done)
2374 {
2375 sk->done = 1;
2376 break;
2377 }
2378 copied = -ENOTCONN;
2379 break;
2380 }
2381
2382 if (sk->shutdown & RCV_SHUTDOWN)
2383 {
2384 sk->done = 1;
2385 break;
2386 }
2387
2388 if (nonblock)
2389 {
2390 copied = -EAGAIN;
2391 break;
2392 }
2393
2394 cleanup_rbuf(sk);
2395 release_sock(sk);
2396 sk->socket->flags |= SO_WAITDATA;
2397 schedule();
2398 sk->socket->flags &= ~SO_WAITDATA;
2399 sk->inuse = 1;
2400
2401 if (current->signal & ~current->blocked)
2402 {
2403 copied = -ERESTARTSYS;
2404 break;
2405 }
2406 continue;
2407
2408 found_ok_skb:
2409 /*
2410 * Lock the buffer. We can be fairly relaxed as
2411 * an interrupt will never steal a buffer we are
2412 * using unless I've missed something serious in
2413 * tcp_data.
2414 */
2415
2416 skb->users++;
2417
2418 /*
2419 * Ok so how much can we use ?
2420 */
2421
2422 used = skb->len - offset;
2423 if (len < used)
2424 used = len;
2425 /*
2426 * Do we have urgent data here?
2427 */
2428
2429 if (sk->urg_data)
2430 {
2431 u32 urg_offset = sk->urg_seq - *seq;
2432 if (urg_offset < used)
2433 {
2434 if (!urg_offset)
2435 {
2436 if (!sk->urginline)
2437 {
2438 ++*seq;
2439 offset++;
2440 used--;
2441 }
2442 }
2443 else
2444 used = urg_offset;
2445 }
2446 }
2447
2448 /*
2449 * Copy it - We _MUST_ update *seq first so that we
2450 * don't ever double read when we have dual readers
2451 */
2452
2453 *seq += used;
2454
2455 /*
2456 * This memcpy_tofs can sleep. If it sleeps and we
2457 * do a second read it relies on the skb->users to avoid
2458 * a crash when cleanup_rbuf() gets called.
2459 */
2460
2461 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2462 skb->h.th->doff*4 + offset, used);
2463 copied += used;
2464 len -= used;
2465
2466 /*
2467 * We now will not sleep again until we are finished
2468 * with skb. Sorry if you are doing the SMP port
2469 * but you'll just have to fix it neatly ;)
2470 */
2471
2472 skb->users --;
2473
2474 if (after(sk->copied_seq,sk->urg_seq))
2475 sk->urg_data = 0;
2476 if (used + offset < skb->len)
2477 continue;
2478
2479 /*
2480 * Process the FIN.
2481 */
2482
2483 if (skb->h.th->fin)
2484 goto found_fin_ok;
2485 if (flags & MSG_PEEK)
2486 continue;
2487 skb->used = 1;
2488 continue;
2489
2490 found_fin_ok:
2491 ++*seq;
2492 if (flags & MSG_PEEK)
2493 break;
2494
2495 /*
2496 * All is done
2497 */
2498
2499 skb->used = 1;
2500 sk->shutdown |= RCV_SHUTDOWN;
2501 break;
2502
2503 }
2504
2505 if(copied>0 && msg->msg_name)
2506 {
2507 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2508 sin->sin_family=AF_INET;
2509 sin->sin_addr.s_addr=sk->daddr;
2510 sin->sin_port=sk->dummy_th.dest;
2511 }
2512 if(addr_len)
2513 *addr_len=sizeof(struct sockaddr_in);
2514
2515 remove_wait_queue(sk->sleep, &wait);
2516 current->state = TASK_RUNNING;
2517
2518 /* Clean up data we have read: This will do ACK frames */
2519 cleanup_rbuf(sk);
2520 release_sock(sk);
2521 return copied;
2522 }
2523
2524
2525
2526 /*
2527 * State processing on a close. This implements the state shift for
2528 * sending our FIN frame. Note that we only send a FIN for some
2529 * states. A shutdown() may have already sent the FIN, or we may be
2530 * closed.
2531 */
2532
2533 static int tcp_close_state(struct sock *sk, int dead)
2534 {
2535 int ns=TCP_CLOSE;
2536 int send_fin=0;
2537 switch(sk->state)
2538 {
2539 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2540 break;
2541 case TCP_SYN_RECV:
2542 case TCP_ESTABLISHED: /* Closedown begin */
2543 ns=TCP_FIN_WAIT1;
2544 send_fin=1;
2545 break;
2546 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2547 case TCP_FIN_WAIT2:
2548 case TCP_CLOSING:
2549 ns=sk->state;
2550 break;
2551 case TCP_CLOSE:
2552 case TCP_LISTEN:
2553 break;
2554 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2555 wait only for the ACK */
2556 ns=TCP_LAST_ACK;
2557 send_fin=1;
2558 }
2559
2560 tcp_set_state(sk,ns);
2561
2562 /*
 2563 	 * This is a (useful) BSD violation of the RFC. There is a
 2564 	 * problem with TCP as specified in that the other end could
 2565 	 * keep a socket open forever with no application left at this end.
 2566 	 * We use a 3 minute timeout (about the same as BSD) then kill
 2567 	 * our end. If they send after that then tough - BUT the timeout is
 2568 	 * long enough that we avoid the old "4*rto = almost no time - whoops,
 2569 	 * reset" mistake.
2570 */
2571 if(dead && ns==TCP_FIN_WAIT2)
2572 {
2573 int timer_active=del_timer(&sk->timer);
2574 if(timer_active)
2575 add_timer(&sk->timer);
2576 else
2577 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2578 }
2579
2580 return send_fin;
2581 }
2582
2583 /*
2584 * Send a fin.
2585 */
2586
2587 static void tcp_send_fin(struct sock *sk)
2588 {
2589 struct proto *prot =(struct proto *)sk->prot;
2590 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2591 struct tcphdr *t1;
2592 struct sk_buff *buff;
2593 struct device *dev=NULL;
2594 int tmp;
2595
2596 release_sock(sk); /* in case the malloc sleeps. */
2597
2598 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2599 sk->inuse = 1;
2600
2601 if (buff == NULL)
2602 {
2603 /* This is a disaster if it occurs */
 2604 		printk("tcp_send_fin: Impossible malloc failure\n");
2605 return;
2606 }
2607
2608 /*
2609 * Administrivia
2610 */
2611
2612 buff->sk = sk;
2613 buff->localroute = sk->localroute;
2614
2615 /*
2616 * Put in the IP header and routing stuff.
2617 */
2618
2619 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2620 IPPROTO_TCP, sk->opt,
2621 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
2622 if (tmp < 0)
2623 {
2624 int t;
2625 /*
2626 * Finish anyway, treat this as a send that got lost.
2627 * (Not good).
2628 */
2629
2630 buff->free = 1;
2631 sock_wfree(sk,buff);
2632 sk->write_seq++;
2633 t=del_timer(&sk->timer);
2634 if(t)
2635 add_timer(&sk->timer);
2636 else
2637 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2638 return;
2639 }
2640
2641 /*
2642 * We ought to check if the end of the queue is a buffer and
2643 * if so simply add the fin to that buffer, not send it ahead.
2644 */
2645
2646 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2647 buff->dev = dev;
2648 memcpy(t1, th, sizeof(*t1));
2649 buff->seq = sk->write_seq;
2650 sk->write_seq++;
2651 buff->end_seq = sk->write_seq;
2652 t1->seq = htonl(buff->seq);
2653 t1->ack = 1;
2654 t1->ack_seq = htonl(sk->acked_seq);
2655 t1->window = htons(sk->window=tcp_select_window(sk));
2656 t1->fin = 1;
2657 t1->rst = 0;
2658 t1->doff = sizeof(*t1)/4;
2659 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2660
2661 /*
2662 * If there is data in the write queue, the fin must be appended to
2663 * the write queue.
2664 */
2665
2666 if (skb_peek(&sk->write_queue) != NULL)
2667 {
2668 buff->free = 0;
2669 if (buff->next != NULL)
2670 {
2671 printk("tcp_send_fin: next != NULL\n");
2672 skb_unlink(buff);
2673 }
2674 skb_queue_tail(&sk->write_queue, buff);
2675 }
2676 else
2677 {
2678 sk->sent_seq = sk->write_seq;
2679 sk->prot->queue_xmit(sk, dev, buff, 0);
2680 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2681 }
2682 }
2683
2684 /*
2685 * Shutdown the sending side of a connection. Much like close except
2686 * that we don't receive shut down or set sk->dead=1.
2687 */
2688
2689 void tcp_shutdown(struct sock *sk, int how)
2690 {
2691 /*
2692 * We need to grab some memory, and put together a FIN,
2693 * and then put it into the queue to be sent.
2694 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2695 */
2696
2697 if (!(how & SEND_SHUTDOWN))
2698 return;
2699
2700 /*
2701 * If we've already sent a FIN, or it's a closed state
2702 */
2703
2704 if (sk->state == TCP_FIN_WAIT1 ||
2705 sk->state == TCP_FIN_WAIT2 ||
2706 sk->state == TCP_CLOSING ||
2707 sk->state == TCP_LAST_ACK ||
2708 sk->state == TCP_TIME_WAIT ||
2709 sk->state == TCP_CLOSE ||
2710 sk->state == TCP_LISTEN
2711 )
2712 {
2713 return;
2714 }
2715 sk->inuse = 1;
2716
2717 /*
2718 * flag that the sender has shutdown
2719 */
2720
2721 sk->shutdown |= SEND_SHUTDOWN;
2722
2723 /*
2724 * Clear out any half completed packets.
2725 */
2726
2727 if (sk->partial)
2728 tcp_send_partial(sk);
2729
2730 /*
2731 * FIN if needed
2732 */
2733
2734 if(tcp_close_state(sk,0))
2735 tcp_send_fin(sk);
2736
2737 release_sock(sk);
2738 }
2739
2740 /*
2741 * This routine will send an RST to the other tcp.
2742 */
2743
2744 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2745 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2746 {
2747 struct sk_buff *buff;
2748 struct tcphdr *t1;
2749 int tmp;
2750 struct device *ndev=NULL;
2751
2752 /*
2753 * Cannot reset a reset (Think about it).
2754 */
2755
2756 if(th->rst)
2757 return;
2758
2759 /*
2760 * We need to grab some memory, and put together an RST,
2761 * and then put it into the queue to be sent.
2762 */
2763
2764 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2765 if (buff == NULL)
2766 return;
2767
2768 buff->sk = NULL;
2769 buff->dev = dev;
2770 buff->localroute = 0;
2771
2772 /*
2773 * Put in the IP header and routing stuff.
2774 */
2775
2776 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2777 sizeof(struct tcphdr),tos,ttl,NULL);
2778 if (tmp < 0)
2779 {
2780 buff->free = 1;
2781 sock_wfree(NULL, buff);
2782 return;
2783 }
2784
2785 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2786 memcpy(t1, th, sizeof(*t1));
2787
2788 /*
2789 * Swap the send and the receive.
2790 */
2791
2792 t1->dest = th->source;
2793 t1->source = th->dest;
2794 t1->rst = 1;
2795 t1->window = 0;
2796
2797 if(th->ack)
2798 {
2799 t1->ack = 0;
2800 t1->seq = th->ack_seq;
2801 t1->ack_seq = 0;
2802 }
2803 else
2804 {
2805 t1->ack = 1;
2806 if(!th->syn)
2807 t1->ack_seq = th->seq;
2808 else
2809 t1->ack_seq = htonl(ntohl(th->seq)+1);
2810 t1->seq = 0;
2811 }
2812
2813 t1->syn = 0;
2814 t1->urg = 0;
2815 t1->fin = 0;
2816 t1->psh = 0;
2817 t1->doff = sizeof(*t1)/4;
2818 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2819 prot->queue_xmit(NULL, ndev, buff, 1);
2820 tcp_statistics.TcpOutSegs++;
2821 }
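 
 /*
 *	Illustrative sketch (not part of the original file): the RFC 793
 *	reset-generation rule that tcp_reset() applies above, in host byte
 *	order for clarity. If the offending segment carried an ACK, the RST
 *	reuses that ack_seq as its own sequence number; otherwise it sends
 *	seq 0 and acks everything the segment occupied (a SYN counts as one
 *	octet). The helper name is hypothetical.
 */
 
 static __inline__ void tcp_rst_numbers(struct tcphdr *in,
 	u32 *seq, u32 *ack_seq, int *ack)
 {
 	if (in->ack)
 	{
 		*ack = 0;
 		*seq = ntohl(in->ack_seq);
 		*ack_seq = 0;
 	}
 	else
 	{
 		*ack = 1;
 		*seq = 0;
 		*ack_seq = ntohl(in->seq) + (in->syn ? 1 : 0);
 	}
 }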
2822
2823
2824 /*
2825 * Look for tcp options. Parses everything but only knows about MSS.
2826 * This routine is always called with the packet containing the SYN.
2827 * However it may also be called with the ack to the SYN. So you
2828 * can't assume this is always the SYN. It's always called after
2829 * we have set up sk->mtu to our own MTU.
2830 *
2831 * We need at minimum to add PAWS support here. Possibly large windows
2832 * as Linux gets deployed on 100Mb/sec networks.
2833 */
2834
2835 static void tcp_options(struct sock *sk, struct tcphdr *th)
2836 {
2837 unsigned char *ptr;
2838 int length=(th->doff*4)-sizeof(struct tcphdr);
2839 int mss_seen = 0;
2840
2841 ptr = (unsigned char *)(th + 1);
2842
2843 while(length>0)
2844 {
2845 int opcode=*ptr++;
2846 int opsize=*ptr++;
2847 switch(opcode)
2848 {
2849 case TCPOPT_EOL:
2850 return;
2851 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2852 length--;
2853 ptr--; /* the opsize=*ptr++ above was a mistake */
2854 continue;
2855
2856 default:
2857 if(opsize<=2) /* Avoid silly options looping forever */
2858 return;
2859 switch(opcode)
2860 {
2861 case TCPOPT_MSS:
2862 if(opsize==4 && th->syn)
2863 {
2864 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2865 mss_seen = 1;
2866 }
2867 break;
2868 /* Add other options here as people feel the urge to implement stuff like large windows */
2869 }
2870 ptr+=opsize-2;
2871 length-=opsize;
2872 }
2873 }
2874 if (th->syn)
2875 {
2876 if (! mss_seen)
2877 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2878 }
2879 #ifdef CONFIG_INET_PCTCP
2880 sk->mss = min(sk->max_window >> 1, sk->mtu);
2881 #else
2882 sk->mss = min(sk->max_window, sk->mtu);
2883 #endif
2884 }
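 
 /*
 *	Illustrative sketch (not part of the original file): the wire format
 *	tcp_options() parses above for the MSS option, and the same four
 *	bytes that tcp_conn_request() below writes into its SYN/ACK by hand:
 *	kind 2, length 4, then the 16 bit MSS in network byte order. The
 *	helper name is hypothetical.
 */
 
 static __inline__ void tcp_write_mss_option(unsigned char *ptr, unsigned short mss)
 {
 	ptr[0] = TCPOPT_MSS;		/* kind = 2 */
 	ptr[1] = 4;			/* length, counting these two bytes */
 	ptr[2] = (mss >> 8) & 0xff;	/* MSS, high byte first */
 	ptr[3] = mss & 0xff;
 }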
2885
2886 static inline unsigned long default_mask(unsigned long dst)
2887 {
2888 dst = ntohl(dst);
2889 if (IN_CLASSA(dst))
2890 return htonl(IN_CLASSA_NET);
2891 if (IN_CLASSB(dst))
2892 return htonl(IN_CLASSB_NET);
2893 return htonl(IN_CLASSC_NET);
2894 }
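 
 /*
 *	Worked example (not from the original source): 10.1.2.3 is class A,
 *	so default_mask() returns 255.0.0.0; 172.16.0.1 is class B, giving
 *	255.255.0.0; 192.168.1.1 is class C, giving 255.255.255.0.
 */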
2895
2896 /*
2897 * Default sequence number picking algorithm.
2898 * As close as possible to RFC 793, which
2899 * suggests using a 250kHz clock.
2900 * Further reading shows this assumes 2MB/s networks.
2901 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2902 * That's funny, Linux has one built in! Use it!
2903 */
2904
2905 extern inline u32 tcp_init_seq(void)
2906 {
2907 struct timeval tv;
2908 do_gettimeofday(&tv);
2909 return tv.tv_usec+tv.tv_sec*1000000;
2910 }
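 
 /*
 *	Worked note (not from the original source): truncating the 1MHz
 *	microsecond count to a u32 gives an initial sequence clock that
 *	wraps every 2^32 us, i.e. about 4295 seconds or 71.6 minutes -
 *	far longer than any plausible segment lifetime, which is what
 *	the ISN scheme needs.
 */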
2911
2912 /*
2913 * This routine handles a connection request.
2914 * It should make sure we haven't already responded.
2915 * Because of the way BSD works, we have to send a syn/ack now.
2916 * This also means it will be harder to close a socket which is
2917 * listening.
2918 */
2919
2920 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2921 unsigned long daddr, unsigned long saddr,
2922 struct options *opt, struct device *dev, u32 seq)
2923 {
2924 struct sk_buff *buff;
2925 struct tcphdr *t1;
2926 unsigned char *ptr;
2927 struct sock *newsk;
2928 struct tcphdr *th;
2929 struct device *ndev=NULL;
2930 int tmp;
2931 struct rtable *rt;
2932
2933 th = skb->h.th;
2934
2935 /* If the socket is dead, don't accept the connection. */
2936 if (!sk->dead)
2937 {
2938 sk->data_ready(sk,0);
2939 }
2940 else
2941 {
2942 if(sk->debug)
2943 printk("Reset on %p: Connect on dead socket.\n",sk);
2944 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2945 tcp_statistics.TcpAttemptFails++;
2946 kfree_skb(skb, FREE_READ);
2947 return;
2948 }
2949
2950 /*
2951 * Make sure we can accept more. This will prevent a
2952 * flurry of syns from eating up all our memory.
2953 */
2954
2955 if (sk->ack_backlog >= sk->max_ack_backlog)
2956 {
2957 tcp_statistics.TcpAttemptFails++;
2958 kfree_skb(skb, FREE_READ);
2959 return;
2960 }
2961
2962 /*
2963 * We need to build a new sock struct.
2964 * It is sort of bad to have a socket without an inode attached
2965 * to it, but the wake_up's will just wake up the listening socket,
2966 * and if the listening socket is destroyed before this is taken
2967 * off of the queue, this will take care of it.
2968 */
2969
2970 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2971 if (newsk == NULL)
2972 {
2973 /* just ignore the syn. It will get retransmitted. */
2974 tcp_statistics.TcpAttemptFails++;
2975 kfree_skb(skb, FREE_READ);
2976 return;
2977 }
2978
2979 memcpy(newsk, sk, sizeof(*newsk));
2980 newsk->opt = NULL;
2981 newsk->ip_route_cache = NULL;
2982 if (opt && opt->optlen) {
 2983 		newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
 2984 		if (!newsk->opt) {
 2985 			kfree_s(newsk, sizeof(struct sock));
 2986 			tcp_statistics.TcpAttemptFails++;
 2987 			kfree_skb(skb, FREE_READ);
 2988 			return;
 2989 		}
 2990 		if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) {
 2991 			kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
2992 kfree_s(newsk, sizeof(struct sock));
2993 tcp_statistics.TcpAttemptFails++;
2994 kfree_skb(skb, FREE_READ);
2995 return;
2996 }
2997 }
2998 skb_queue_head_init(&newsk->write_queue);
2999 skb_queue_head_init(&newsk->receive_queue);
3000 newsk->send_head = NULL;
3001 newsk->send_tail = NULL;
3002 skb_queue_head_init(&newsk->back_log);
3003 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
3004 newsk->rto = TCP_TIMEOUT_INIT;
3005 newsk->mdev = 0;
3006 newsk->max_window = 0;
3007 newsk->cong_window = 1;
3008 newsk->cong_count = 0;
3009 newsk->ssthresh = 0;
3010 newsk->backoff = 0;
3011 newsk->blog = 0;
3012 newsk->intr = 0;
3013 newsk->proc = 0;
3014 newsk->done = 0;
3015 newsk->partial = NULL;
3016 newsk->pair = NULL;
3017 newsk->wmem_alloc = 0;
3018 newsk->rmem_alloc = 0;
3019 newsk->localroute = sk->localroute;
3020
3021 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
3022
3023 newsk->err = 0;
3024 newsk->shutdown = 0;
3025 newsk->ack_backlog = 0;
3026 newsk->acked_seq = skb->seq+1;
3027 newsk->copied_seq = skb->seq+1;
3028 newsk->fin_seq = skb->seq;
3029 newsk->state = TCP_SYN_RECV;
3030 newsk->timeout = 0;
3031 newsk->ip_xmit_timeout = 0;
3032 newsk->write_seq = seq;
3033 newsk->window_seq = newsk->write_seq;
3034 newsk->rcv_ack_seq = newsk->write_seq;
3035 newsk->urg_data = 0;
3036 newsk->retransmits = 0;
3037 newsk->linger=0;
3038 newsk->destroy = 0;
3039 init_timer(&newsk->timer);
3040 newsk->timer.data = (unsigned long)newsk;
3041 newsk->timer.function = &net_timer;
3042 init_timer(&newsk->retransmit_timer);
3043 newsk->retransmit_timer.data = (unsigned long)newsk;
3044 newsk->retransmit_timer.function=&retransmit_timer;
3045 newsk->dummy_th.source = skb->h.th->dest;
3046 newsk->dummy_th.dest = skb->h.th->source;
3047
3048 /*
3049 * Swap these two, they are from our point of view.
3050 */
3051
3052 newsk->daddr = saddr;
3053 newsk->saddr = daddr;
3054 newsk->rcv_saddr = daddr;
3055
3056 put_sock(newsk->num,newsk);
3057 newsk->dummy_th.res1 = 0;
3058 newsk->dummy_th.doff = 6;
3059 newsk->dummy_th.fin = 0;
3060 newsk->dummy_th.syn = 0;
3061 newsk->dummy_th.rst = 0;
3062 newsk->dummy_th.psh = 0;
3063 newsk->dummy_th.ack = 0;
3064 newsk->dummy_th.urg = 0;
3065 newsk->dummy_th.res2 = 0;
3066 newsk->acked_seq = skb->seq + 1;
3067 newsk->copied_seq = skb->seq + 1;
3068 newsk->socket = NULL;
3069
3070 /*
3071 * Grab the ttl and tos values and use them
3072 */
3073
3074 newsk->ip_ttl=sk->ip_ttl;
3075 newsk->ip_tos=skb->ip_hdr->tos;
3076
3077 /*
3078 * Use 512 or whatever user asked for
3079 */
3080
3081 /*
3082 * Note use of sk->user_mss, since user has no direct access to newsk
3083 */
3084
3085 rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
3086 newsk->ip_route_cache = rt;
3087
3088 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3089 newsk->window_clamp = rt->rt_window;
3090 else
3091 newsk->window_clamp = 0;
3092
3093 if (sk->user_mss)
3094 newsk->mtu = sk->user_mss;
3095 else if (rt)
3096 newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
3097 else
3098 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3099
3100 /*
3101 * But not bigger than device MTU
3102 */
3103
3104 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3105
3106 #ifdef CONFIG_SKIP
3107
3108 /*
3109 * SKIP devices set their MTU to 65535. This is so they can take packets
 3110 	 * unfragmented to the security process and then fragment. They could lie to the
 3111 	 * TCP layer about a suitable MTU, but it's easier to let SKIP sort it out
 3112 	 * simply because the final packet we want unfragmented is going to be
3113 *
3114 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
3115 */
3116
3117 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
 3118 		newsk->mtu=skip_pick_mtu(newsk->mtu,dev);
3119 #endif
3120 /*
3121 * This will min with what arrived in the packet
3122 */
3123
3124 tcp_options(newsk,skb->h.th);
3125
3126 tcp_cache_zap();
3127
3128 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3129 if (buff == NULL)
3130 {
3131 sk->err = ENOMEM;
3132 newsk->dead = 1;
3133 newsk->state = TCP_CLOSE;
3134 /* And this will destroy it */
3135 release_sock(newsk);
3136 kfree_skb(skb, FREE_READ);
3137 tcp_statistics.TcpAttemptFails++;
3138 return;
3139 }
3140
3141 buff->sk = newsk;
3142 buff->localroute = newsk->localroute;
3143
3144 /*
3145 * Put in the IP header and routing stuff.
3146 */
3147
3148 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3149 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&newsk->ip_route_cache);
3150
3151 /*
3152 * Something went wrong.
3153 */
3154
3155 if (tmp < 0)
3156 {
3157 sk->err = tmp;
3158 buff->free = 1;
3159 kfree_skb(buff,FREE_WRITE);
3160 newsk->dead = 1;
3161 newsk->state = TCP_CLOSE;
3162 release_sock(newsk);
3163 skb->sk = sk;
3164 kfree_skb(skb, FREE_READ);
3165 tcp_statistics.TcpAttemptFails++;
3166 return;
3167 }
3168
3169 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3170
3171 memcpy(t1, skb->h.th, sizeof(*t1));
3172 buff->seq = newsk->write_seq++;
3173 buff->end_seq = newsk->write_seq;
3174 /*
3175 * Swap the send and the receive.
3176 */
3177 t1->dest = skb->h.th->source;
3178 t1->source = newsk->dummy_th.source;
 3179 	t1->seq = htonl(buff->seq);
 3180 	t1->ack = 1;
 3181 	newsk->window = tcp_select_window(newsk);
 3182 	newsk->sent_seq = newsk->write_seq;
 3183 	t1->window = htons(newsk->window);
3184 t1->res1 = 0;
3185 t1->res2 = 0;
3186 t1->rst = 0;
3187 t1->urg = 0;
3188 t1->psh = 0;
3189 t1->syn = 1;
3190 t1->ack_seq = htonl(newsk->acked_seq);
3191 t1->doff = sizeof(*t1)/4+1;
3192 ptr = skb_put(buff,4);
3193 ptr[0] = 2;
3194 ptr[1] = 4;
3195 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3196 ptr[3] =(newsk->mtu) & 0xff;
3197
3198 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3199 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3200 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3201 skb->sk = newsk;
3202
3203 /*
3204 * Charge the sock_buff to newsk.
3205 */
3206
3207 sk->rmem_alloc -= skb->truesize;
3208 newsk->rmem_alloc += skb->truesize;
3209
3210 skb_queue_tail(&sk->receive_queue,skb);
3211 sk->ack_backlog++;
3212 release_sock(newsk);
3213 tcp_statistics.TcpOutSegs++;
3214 }
3215
3216
3217 static void tcp_close(struct sock *sk, int timeout)
3218 {
3219 /*
3220 * We need to grab some memory, and put together a FIN,
3221 * and then put it into the queue to be sent.
3222 */
3223
3224 sk->inuse = 1;
3225
3226 if(th_cache_sk==sk)
3227 tcp_cache_zap();
3228 if(sk->state == TCP_LISTEN)
3229 {
3230 /* Special case */
3231 tcp_set_state(sk, TCP_CLOSE);
3232 tcp_close_pending(sk);
3233 release_sock(sk);
3234 return;
3235 }
3236
3237 sk->keepopen = 1;
3238 sk->shutdown = SHUTDOWN_MASK;
3239
3240 if (!sk->dead)
3241 sk->state_change(sk);
3242
3243 if (timeout == 0)
3244 {
3245 struct sk_buff *skb;
3246
3247 /*
3248 * We need to flush the recv. buffs. We do this only on the
3249 * descriptor close, not protocol-sourced closes, because the
3250 * reader process may not have drained the data yet!
3251 */
3252
3253 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3254 kfree_skb(skb, FREE_READ);
3255 /*
 3256 		 * Get rid of any half-completed packets.
3257 */
3258
3259 if (sk->partial)
3260 tcp_send_partial(sk);
3261 }
3262
3263
3264 /*
3265 * Timeout is not the same thing - however the code likes
3266 * to send both the same way (sigh).
3267 */
3268
3269 if(timeout)
3270 {
3271 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3272 }
3273 else
3274 {
3275 if(tcp_close_state(sk,1)==1)
3276 {
3277 tcp_send_fin(sk);
3278 }
3279 }
3280 release_sock(sk);
3281 }
3282
3283
3284 /*
3285 * This routine takes stuff off of the write queue,
3286 * and puts it in the xmit queue. This happens as incoming acks
3287 * open up the remote window for us.
3288 */
3289
3290 static void tcp_write_xmit(struct sock *sk)
3291 {
3292 struct sk_buff *skb;
3293
3294 /*
3295 * The bytes will have to remain here. In time closedown will
3296 * empty the write queue and all will be happy
3297 */
3298
3299 if(sk->zapped)
3300 return;
3301
3302 /*
3303 * Anything on the transmit queue that fits the window can
3304 * be added providing we are not
3305 *
3306 * a) retransmitting (Nagle's rule)
3307 * b) exceeding our congestion window.
3308 */
3309
3310 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3311 before(skb->end_seq, sk->window_seq + 1) &&
3312 (sk->retransmits == 0 ||
3313 sk->ip_xmit_timeout != TIME_WRITE ||
3314 before(skb->end_seq, sk->rcv_ack_seq + 1))
3315 && sk->packets_out < sk->cong_window)
3316 {
3317 IS_SKB(skb);
3318 skb_unlink(skb);
3319
3320 /*
3321 * See if we really need to send the packet.
3322 */
3323
3324 if (before(skb->end_seq, sk->rcv_ack_seq +1))
3325 {
3326 /*
3327 * This is acked data. We can discard it. This
3328 * cannot currently occur.
3329 */
3330
3331 sk->retransmits = 0;
3332 kfree_skb(skb, FREE_WRITE);
3333 if (!sk->dead)
3334 sk->write_space(sk);
3335 }
3336 else
3337 {
3338 struct tcphdr *th;
3339 struct iphdr *iph;
3340 int size;
3341 /*
3342 * put in the ack seq and window at this point rather than earlier,
3343 * in order to keep them monotonic. We really want to avoid taking
3344 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3345 * Ack and window will in general have changed since this packet was put
3346 * on the write queue.
3347 */
3348 iph = skb->ip_hdr;
3349 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3350 size = skb->len - (((unsigned char *) th) - skb->data);
3351 #ifndef CONFIG_NO_PATH_MTU_DISCOVERY
3352 if (size > sk->mtu - sizeof(struct iphdr))
3353 {
3354 iph->frag_off &= ~htons(IP_DF);
3355 ip_send_check(iph);
3356 }
3357 #endif
3358
3359 th->ack_seq = htonl(sk->acked_seq);
3360 th->window = htons(tcp_select_window(sk));
3361
3362 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3363
3364 sk->sent_seq = skb->end_seq;
3365
3366 /*
3367 * IP manages our queue for some crazy reason
3368 */
3369
3370 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3371
3372 /*
3373 * Again we slide the timer wrongly
3374 */
3375
3376 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3377 }
3378 }
3379 }
3380
3381
3382 /*
3383 * This routine deals with incoming acks, but not outgoing ones.
3384 */
3385
3386 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3387 {
3388 u32 ack;
3389 int flag = 0;
3390
3391 /*
3392 * 1 - there was data in packet as well as ack or new data is sent or
3393 * in shutdown state
3394 * 2 - data from retransmit queue was acked and removed
3395 * 4 - window shrunk or data from retransmit queue was acked and removed
3396 */
3397
3398 if(sk->zapped)
 3399 		return(1); /* Dead, can't ack any more so why bother */
3400
3401 /*
3402 * Have we discovered a larger window
3403 */
3404
3405 ack = ntohl(th->ack_seq);
3406
3407 if (ntohs(th->window) > sk->max_window)
3408 {
3409 sk->max_window = ntohs(th->window);
3410 #ifdef CONFIG_INET_PCTCP
3411 /* Hack because we don't send partial packets to non SWS
3412 handling hosts */
3413 sk->mss = min(sk->max_window>>1, sk->mtu);
3414 #else
3415 sk->mss = min(sk->max_window, sk->mtu);
3416 #endif
3417 }
3418
3419 /*
3420 * We have dropped back to keepalive timeouts. Thus we have
3421 * no retransmits pending.
3422 */
3423
3424 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3425 sk->retransmits = 0;
3426
3427 /*
3428 * If the ack is newer than sent or older than previous acks
3429 * then we can probably ignore it.
3430 */
3431
3432 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3433 {
3434 if(sk->debug)
3435 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3436
3437 /*
3438 * Keepalive processing.
3439 */
3440
3441 if (after(ack, sk->sent_seq))
3442 {
3443 return(0);
3444 }
3445
3446 /*
3447 * Restart the keepalive timer.
3448 */
3449
3450 if (sk->keepopen)
3451 {
3452 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3453 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3454 }
3455 return(1);
3456 }
3457
3458 /*
3459 * If there is data set flag 1
3460 */
3461
3462 if (len != th->doff*4)
3463 flag |= 1;
3464
3465 /*
3466 * See if our window has been shrunk.
3467 */
3468
3469 if (after(sk->window_seq, ack+ntohs(th->window)))
3470 {
3471 /*
3472 * We may need to move packets from the send queue
3473 * to the write queue, if the window has been shrunk on us.
3474 * The RFC says you are not allowed to shrink your window
3475 * like this, but if the other end does, you must be able
3476 * to deal with it.
3477 */
3478 struct sk_buff *skb;
3479 struct sk_buff *skb2;
3480 struct sk_buff *wskb = NULL;
3481
3482 skb2 = sk->send_head;
3483 sk->send_head = NULL;
3484 sk->send_tail = NULL;
3485
3486 /*
3487 * This is an artifact of a flawed concept. We want one
3488 * queue and a smarter send routine when we send all.
3489 */
3490
3491 flag |= 4; /* Window changed */
3492
3493 sk->window_seq = ack + ntohs(th->window);
3494 cli();
3495 while (skb2 != NULL)
3496 {
3497 skb = skb2;
3498 skb2 = skb->link3;
3499 skb->link3 = NULL;
3500 if (after(skb->end_seq, sk->window_seq))
3501 {
3502 if (sk->packets_out > 0)
3503 sk->packets_out--;
3504 /* We may need to remove this from the dev send list. */
3505 if (skb->next != NULL)
3506 {
3507 skb_unlink(skb);
3508 }
3509 /* Now add it to the write_queue. */
3510 if (wskb == NULL)
3511 skb_queue_head(&sk->write_queue,skb);
3512 else
3513 skb_append(wskb,skb);
3514 wskb = skb;
3515 }
3516 else
3517 {
3518 if (sk->send_head == NULL)
3519 {
3520 sk->send_head = skb;
3521 sk->send_tail = skb;
3522 }
3523 else
3524 {
3525 sk->send_tail->link3 = skb;
3526 sk->send_tail = skb;
3527 }
3528 skb->link3 = NULL;
3529 }
3530 }
3531 sti();
3532 }
3533
3534 /*
3535 * Pipe has emptied
3536 */
3537
3538 if (sk->send_tail == NULL || sk->send_head == NULL)
3539 {
3540 sk->send_head = NULL;
3541 sk->send_tail = NULL;
3542 sk->packets_out= 0;
3543 }
3544
3545 /*
3546 * Update the right hand window edge of the host
3547 */
3548
3549 sk->window_seq = ack + ntohs(th->window);
3550
3551 /*
3552 * We don't want too many packets out there.
3553 */
3554
3555 if (sk->ip_xmit_timeout == TIME_WRITE &&
3556 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3557 {
3558 /*
3559 * This is Jacobson's slow start and congestion avoidance.
3560 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3561 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3562 * counter and increment it once every cwnd times. It's possible
3563 * that this should be done only if sk->retransmits == 0. I'm
3564 * interpreting "new data is acked" as including data that has
3565 * been retransmitted but is just now being acked.
3566 */
3567 if (sk->cong_window < sk->ssthresh)
3568 /*
3569 * In "safe" area, increase
3570 */
3571 sk->cong_window++;
3572 else
3573 {
3574 /*
3575 * In dangerous area, increase slowly. In theory this is
3576 * sk->cong_window += 1 / sk->cong_window
3577 */
3578 if (sk->cong_count >= sk->cong_window)
3579 {
3580 sk->cong_window++;
3581 sk->cong_count = 0;
3582 }
3583 else
3584 sk->cong_count++;
3585 }
3586 }
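 		/*
 		 * Worked example (not from the original source): with
 		 * ssthresh = 8, the code above grows cong_window by one
 		 * segment per ack (1, 2, 3, ... 8), doubling it every
 		 * round trip while in slow start. Past ssthresh it takes
 		 * a full window of acks per increment - 8 acks to reach
 		 * 9, then 9 more to reach 10 - approximating the
 		 * cwnd += 1/cwnd of congestion avoidance.
 		 */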
3587
3588 /*
3589 * Remember the highest ack received.
3590 */
3591
3592 sk->rcv_ack_seq = ack;
3593
3594 /*
3595 * If this ack opens up a zero window, clear backoff. It was
3596 * being used to time the probes, and is probably far higher than
3597 * it needs to be for normal retransmission.
3598 */
3599
3600 if (sk->ip_xmit_timeout == TIME_PROBE0)
3601 {
3602 sk->retransmits = 0; /* Our probe was answered */
3603
3604 /*
3605 * Was it a usable window open ?
3606 */
3607
3608 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3609 ! before (sk->window_seq, sk->write_queue.next->end_seq))
3610 {
3611 sk->backoff = 0;
3612
3613 /*
3614 * Recompute rto from rtt. this eliminates any backoff.
3615 */
3616
3617 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3618 if (sk->rto > 120*HZ)
3619 sk->rto = 120*HZ;
3620 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3621 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3622 .2 of a second is going to need huge windows (SIGH) */
3623 sk->rto = 20;
3624 }
3625 }
3626
3627 /*
3628 * See if we can take anything off of the retransmit queue.
3629 */
3630
3631 while(sk->send_head != NULL)
3632 {
3633 /* Check for a bug. */
3634 if (sk->send_head->link3 &&
3635 after(sk->send_head->end_seq, sk->send_head->link3->end_seq))
3636 printk("INET: tcp.c: *** bug send_list out of order.\n");
3637
3638 /*
3639 * If our packet is before the ack sequence we can
 3640 		 * discard it as it's confirmed to have arrived at the other end.
3641 */
3642
3643 if (before(sk->send_head->end_seq, ack+1))
3644 {
3645 struct sk_buff *oskb;
3646 if (sk->retransmits)
3647 {
3648 /*
3649 * We were retransmitting. don't count this in RTT est
3650 */
3651 flag |= 2;
3652
3653 /*
3654 * even though we've gotten an ack, we're still
3655 * retransmitting as long as we're sending from
3656 * the retransmit queue. Keeping retransmits non-zero
3657 * prevents us from getting new data interspersed with
3658 * retransmissions.
3659 */
3660
3661 if (sk->send_head->link3) /* Any more queued retransmits? */
3662 sk->retransmits = 1;
3663 else
3664 sk->retransmits = 0;
3665 }
3666 /*
3667 * Note that we only reset backoff and rto in the
3668 * rtt recomputation code. And that doesn't happen
3669 * if there were retransmissions in effect. So the
3670 * first new packet after the retransmissions is
3671 * sent with the backoff still in effect. Not until
3672 * we get an ack from a non-retransmitted packet do
3673 * we reset the backoff and rto. This allows us to deal
3674 * with a situation where the network delay has increased
3675 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3676 */
3677
3678 /*
3679 * We have one less packet out there.
3680 */
3681
3682 if (sk->packets_out > 0)
3683 sk->packets_out --;
3684 /*
3685 * Wake up the process, it can probably write more.
3686 */
3687 if (!sk->dead)
3688 sk->write_space(sk);
3689 oskb = sk->send_head;
3690
3691 if (!(flag&2)) /* Not retransmitting */
3692 {
3693 long m;
3694
3695 /*
3696 * The following amusing code comes from Jacobson's
3697 * article in SIGCOMM '88. Note that rtt and mdev
3698 * are scaled versions of rtt and mean deviation.
3699 * This is designed to be as fast as possible
3700 * m stands for "measurement".
3701 */
3702
3703 m = jiffies - oskb->when; /* RTT */
3704 if(m<=0)
3705 m=1; /* IS THIS RIGHT FOR <0 ??? */
3706 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3707 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3708 if (m < 0)
3709 m = -m; /* m is now abs(error) */
3710 m -= (sk->mdev >> 2); /* similar update on mdev */
3711 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3712
3713 /*
3714 * Now update timeout. Note that this removes any backoff.
3715 */
3716
3717 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3718 if (sk->rto > 120*HZ)
3719 sk->rto = 120*HZ;
3720 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3721 sk->rto = 20;
3722 sk->backoff = 0;
3723 }
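 			/*
 			 * Worked note (not from the original source): rtt is
 			 * kept scaled by 8 and mdev by 4, so the lines above
 			 * compute
 			 *	srtt += (m - srtt)/8,  mdev += (|m - srtt| - mdev)/4
 			 * and the timeout works out as
 			 *	rto = ((8*srtt >> 2) + 4*mdev) >> 1 = srtt + 2*mdev
 			 * clamped to between 20 jiffies and 120*HZ.
 			 */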
 3724 			flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
 3725 					  in this case', as we just set it up */
3726 cli();
3727 oskb = sk->send_head;
3728 IS_SKB(oskb);
3729 sk->send_head = oskb->link3;
3730 if (sk->send_head == NULL)
3731 {
3732 sk->send_tail = NULL;
3733 }
3734
3735 /*
3736 * We may need to remove this from the dev send list.
3737 */
3738
3739 if (oskb->next)
3740 skb_unlink(oskb);
3741 sti();
3742 kfree_skb(oskb, FREE_WRITE); /* write. */
3743 if (!sk->dead)
3744 sk->write_space(sk);
3745 }
3746 else
3747 {
3748 break;
3749 }
3750 }
3751
3752 /*
3753 * XXX someone ought to look at this too.. at the moment, if skb_peek()
 3754 	 * returns non-NULL, we completely ignore the timer stuff in the else
3755 * clause. We ought to organize the code so that else clause can
3756 * (should) be executed regardless, possibly moving the PROBE timer
3757 * reset over. The skb_peek() thing should only move stuff to the
3758 * write queue, NOT also manage the timer functions.
3759 */
3760
3761 /*
3762 * Maybe we can take some stuff off of the write queue,
3763 * and put it onto the xmit queue.
3764 */
3765 if (skb_peek(&sk->write_queue) != NULL)
3766 {
3767 if (after (sk->window_seq+1, sk->write_queue.next->end_seq) &&
3768 (sk->retransmits == 0 ||
3769 sk->ip_xmit_timeout != TIME_WRITE ||
3770 before(sk->write_queue.next->end_seq, sk->rcv_ack_seq + 1))
3771 && sk->packets_out < sk->cong_window)
3772 {
3773 /*
3774 * Add more data to the send queue.
3775 */
3776 flag |= 1;
3777 tcp_write_xmit(sk);
3778 }
3779 else if (before(sk->window_seq, sk->write_queue.next->end_seq) &&
3780 sk->send_head == NULL &&
3781 sk->ack_backlog == 0 &&
3782 sk->state != TCP_TIME_WAIT)
3783 {
3784 /*
3785 * Data to queue but no room.
3786 */
3787 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3788 }
3789 }
3790 else
3791 {
3792 /*
3793 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
3794 * from TCP_CLOSE we don't do anything
3795 *
3796 * from anything else, if there is write data (or fin) pending,
3797 * we use a TIME_WRITE timeout, else if keepalive we reset to
3798 * a KEEPALIVE timeout, else we delete the timer.
3799 *
3800 * We do not set flag for nominal write data, otherwise we may
3801 * force a state where we start to write itsy bitsy tidbits
3802 * of data.
3803 */
3804
3805 switch(sk->state) {
3806 case TCP_TIME_WAIT:
3807 /*
3808 * keep us in TIME_WAIT until we stop getting packets,
3809 * reset the timeout.
3810 */
3811 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3812 break;
3813 case TCP_CLOSE:
3814 /*
3815 * don't touch the timer.
3816 */
3817 break;
3818 default:
3819 /*
3820 * Must check send_head, write_queue, and ack_backlog
3821 * to determine which timeout to use.
3822 */
3823 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3824 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3825 } else if (sk->keepopen) {
3826 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3827 } else {
3828 del_timer(&sk->retransmit_timer);
3829 sk->ip_xmit_timeout = 0;
3830 }
3831 break;
3832 }
3833 }
3834
3835 /*
3836 * We have nothing queued but space to send. Send any partial
3837 * packets immediately (end of Nagle rule application).
3838 */
3839
3840 if (sk->packets_out == 0 && sk->partial != NULL &&
3841 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3842 {
3843 flag |= 1;
3844 tcp_send_partial(sk);
3845 }
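	/*
	 *	E.g. a lone telnet keystroke held in sk->partial is flushed
	 *	here as soon as nothing remains in flight or queued - the
	 *	classic Nagle "at most one small packet per round trip"
	 *	behaviour.
	 */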
3846
3847 /*
3848 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3849 * we are now waiting for an acknowledge to our FIN. The other end is
3850 * already in TIME_WAIT.
3851 *
3852 * Move to TCP_CLOSE on success.
3853 */
3854
3855 if (sk->state == TCP_LAST_ACK)
3856 {
3857 if (!sk->dead)
3858 sk->state_change(sk);
3859 if(sk->debug)
3860 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3861 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3862 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3863 {
3864 flag |= 1;
3865 tcp_set_state(sk,TCP_CLOSE);
3866 sk->shutdown = SHUTDOWN_MASK;
3867 }
3868 }
3869
3870 /*
3871 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3872 *
3873 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3874 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3875 */
3876
3877 if (sk->state == TCP_FIN_WAIT1)
3878 {
3879
3880 if (!sk->dead)
3881 sk->state_change(sk);
3882 if (sk->rcv_ack_seq == sk->write_seq)
3883 {
3884 flag |= 1;
3885 sk->shutdown |= SEND_SHUTDOWN;
3886 tcp_set_state(sk, TCP_FIN_WAIT2);
3887 }
3888 }
3889
3890 /*
3891 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3892 *
3893 * Move to TIME_WAIT
3894 */
3895
3896 if (sk->state == TCP_CLOSING)
3897 {
3898
3899 if (!sk->dead)
3900 sk->state_change(sk);
3901 if (sk->rcv_ack_seq == sk->write_seq)
3902 {
3903 flag |= 1;
3904 tcp_time_wait(sk);
3905 }
3906 }
3907
3908 /*
3909 * Final ack of a three way shake
3910 */
3911
3912 if(sk->state==TCP_SYN_RECV)
3913 {
3914 tcp_set_state(sk, TCP_ESTABLISHED);
3915 tcp_options(sk,th);
3916 sk->dummy_th.dest=th->source;
3917 sk->copied_seq = sk->acked_seq;
3918 if(!sk->dead)
3919 sk->state_change(sk);
3920 if(sk->max_window==0)
3921 {
3922 sk->max_window=32; /* Sanity check */
3923 sk->mss=min(sk->max_window,sk->mtu);
3924 }
3925 }
3926
3927 /*
3928 * I make no guarantees about the first clause in the following
3929 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3930 * what conditions "!flag" would be true. However I think the rest
3931 * of the conditions would prevent that from causing any
3932 * unnecessary retransmission.
3933 * Clearly if the first packet has expired it should be
3934 * retransmitted. The other alternative, "flag&2 && retransmits", is
3935 * harder to explain: You have to look carefully at how and when the
3936 * timer is set and with what timeout. The most recent transmission always
3937 * sets the timer. So in general if the most recent thing has timed
3938 * out, everything before it has as well. So we want to go ahead and
3939 * retransmit some more. If we didn't explicitly test for this
3940 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3941 * would not be true. If you look at the pattern of timing, you can
3942 * show that rto is increased fast enough that the next packet would
3943 * almost never be retransmitted immediately. Then you'd end up
3944 * waiting for a timeout to send each packet on the retransmission
3945 * queue. With my implementation of the Karn sampling algorithm,
3946 * the timeout would double each time. The net result is that it would
3947 * take a hideous amount of time to recover from a single dropped packet.
3948 * It's possible that there should also be a test for TIME_WRITE, but
3949 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3950 * got to be in real retransmission mode.
3951 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3952 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3953 * As long as no further losses occur, this seems reasonable.
3954 */
3955
3956 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3957 (((flag&2) && sk->retransmits) ||
3958 (sk->send_head->when + sk->rto < jiffies)))
3959 {
3960 if(sk->send_head->when + sk->rto < jiffies)
3961 tcp_retransmit(sk,0);
3962 else
3963 {
3964 tcp_do_retransmit(sk, 1);
3965 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3966 }
3967 }
3968
3969 return(1);
3970 }
3971
3972
3973 /*
3974	 *	Process the FIN bit. This now behaves as it is supposed to:
3975	 *	the FIN takes effect when it is validly part of the sequence
3976	 *	space, not earlier, while there are still holes.
3977 *
3978 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3979 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3980 * TIME-WAIT)
3981 *
3982 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3983 * close and we go into CLOSING (and later onto TIME-WAIT)
3984 *
3985 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3986 *
3987 */
3988
3989 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3990 {
3991 sk->fin_seq = skb->end_seq;
3992
3993 if (!sk->dead)
3994 {
3995 sk->state_change(sk);
3996 sock_wake_async(sk->socket, 1);
3997 }
3998
3999 switch(sk->state)
4000 {
4001 case TCP_SYN_RECV:
4002 case TCP_SYN_SENT:
4003 case TCP_ESTABLISHED:
4004 /*
4005 * move to CLOSE_WAIT, tcp_data() already handled
4006 * sending the ack.
4007 */
4008 tcp_set_state(sk,TCP_CLOSE_WAIT);
4009 if (th->rst)
4010 sk->shutdown = SHUTDOWN_MASK;
4011 break;
4012
4013 case TCP_CLOSE_WAIT:
4014 case TCP_CLOSING:
4015 /*
4016 * received a retransmission of the FIN, do
4017 * nothing.
4018 */
4019 break;
4020 case TCP_TIME_WAIT:
4021 /*
4022 * received a retransmission of the FIN,
4023 * restart the TIME_WAIT timer.
4024 */
4025 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4026 return(0);
4027 case TCP_FIN_WAIT1:
4028 /*
4029 * This case occurs when a simultaneous close
4030 * happens, we must ack the received FIN and
4031 * enter the CLOSING state.
4032 *
4033 * This causes a WRITE timeout, which will either
4034 * move on to TIME_WAIT when we timeout, or resend
4035 * the FIN properly (maybe we get rid of that annoying
4036 * FIN lost hang). The TIME_WRITE code is already correct
4037 * for handling this timeout.
4038 */
4039
4040 if(sk->ip_xmit_timeout != TIME_WRITE)
4041 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4042 tcp_set_state(sk,TCP_CLOSING);
4043 break;
4044 case TCP_FIN_WAIT2:
4045 /*
4046 * received a FIN -- send ACK and enter TIME_WAIT
4047 */
4048 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4049 sk->shutdown|=SHUTDOWN_MASK;
4050 tcp_set_state(sk,TCP_TIME_WAIT);
4051 break;
4052 case TCP_CLOSE:
4053 /*
4054 * already in CLOSE
4055 */
4056 break;
4057 default:
4058 tcp_set_state(sk,TCP_LAST_ACK);
4059
4060 /* Start the timers. */
4061 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
4062 return(0);
4063 }
4064
4065 return(0);
4066 }
4067
4068
4069
4070 /*
4071 * This routine handles the data. If there is room in the buffer,
4072	 *	it will already have been moved into it. If there is no
4073 * room, then we will just have to discard the packet.
4074 */
4075
4076 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
4077 unsigned long saddr, unsigned short len)
4078 {
4079 struct sk_buff *skb1, *skb2;
4080 struct tcphdr *th;
4081 int dup_dumped=0;
4082 u32 new_seq, shut_seq;
4083
4084 th = skb->h.th;
4085 skb_pull(skb,th->doff*4);
4086 skb_trim(skb,len-(th->doff*4));
4087
4088 /*
4089	 *	The number of bytes in the receive read/assembly queue has increased.
4090	 *	Needed for the low memory discard algorithm.
4091 */
4092
4093 sk->bytes_rcv += skb->len;
4094
4095 if (skb->len == 0 && !th->fin)
4096 {
4097 /*
4098	 *	Don't want to keep passing acks back and forth
4099	 *	(someone sent us a dataless, boring frame).
4100 */
4101 if (!th->ack)
4102 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4103 kfree_skb(skb, FREE_READ);
4104 return(0);
4105 }
4106
4107 /*
4108 * We no longer have anyone receiving data on this connection.
4109 */
4110
4111 #ifndef TCP_DONT_RST_SHUTDOWN
4112
4113 if(sk->shutdown & RCV_SHUTDOWN)
4114 {
4115 /*
4116 * FIXME: BSD has some magic to avoid sending resets to
4117	 *	broken 4.2 BSD keepalives. Much to my surprise a few non-BSD
4118	 *	stacks still have broken keepalives, so we want to
4119	 *	cope with it.
4120 */
4121
4122 if(skb->len) /* We don't care if it's just an ack or
4123 a keepalive/window probe */
4124 {
4125 new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4126
4127 /* Do this the way 4.4BSD treats it. Not what I'd
4128 regard as the meaning of the spec but it's what BSD
4129 does and clearly they know everything 8) */
4130
4131 /*
4132 * This is valid because of two things
4133 *
4134 * a) The way tcp_data behaves at the bottom.
4135 * b) A fin takes effect when read not when received.
4136 */
4137
4138 shut_seq = sk->acked_seq+1; /* Last byte */
4139
4140 if(after(new_seq,shut_seq))
4141 {
4142 if(sk->debug)
4143 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4144 sk, new_seq, shut_seq, sk->blog);
4145 if(sk->dead)
4146 {
4147 sk->acked_seq = new_seq + th->fin;
4148 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4149 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4150 tcp_statistics.TcpEstabResets++;
4151 tcp_set_state(sk,TCP_CLOSE);
4152 sk->err = EPIPE;
4153 sk->shutdown = SHUTDOWN_MASK;
4154 kfree_skb(skb, FREE_READ);
4155 return 0;
4156 }
4157 }
4158 }
4159 }
4160
4161 #endif
4162
4163 /*
4164 * Now we have to walk the chain, and figure out where this one
4165 * goes into it. This is set up so that the last packet we received
4166 * will be the first one we look at, that way if everything comes
4167 * in order, there will be no performance loss, and if they come
4168 * out of order we will be able to fit things in nicely.
4169 *
4170 * [AC: This is wrong. We should assume in order first and then walk
4171 * forwards from the first hole based upon real traffic patterns.]
4172 *
4173 */
4174
4175 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4176 {
4177 skb_queue_head(&sk->receive_queue,skb);
4178 skb1= NULL;
4179 }
4180 else
4181 {
4182 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4183 {
4184 if(sk->debug)
4185 {
4186 printk("skb1=%p :", skb1);
4187 printk("skb1->seq = %d: ", skb1->seq);
4188 printk("skb->seq = %d\n",skb->seq);
4189 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4190 sk->acked_seq);
4191 }
4192
4193 /*
4194 * Optimisation: Duplicate frame or extension of previous frame from
4195 * same sequence point (lost ack case).
4196	 * The frame contains duplicate data or replaces a previous frame:
4197	 * discard the previous frame (safe as sk->inuse is set) and put
4198	 * the new one in its place.
4199 */
4200
4201 if (skb->seq==skb1->seq && skb->len>=skb1->len)
4202 {
4203 skb_append(skb1,skb);
4204 skb_unlink(skb1);
4205 kfree_skb(skb1,FREE_READ);
4206 dup_dumped=1;
4207 skb1=NULL;
4208 break;
4209 }
4210
4211 /*
4212 * Found where it fits
4213 */
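			/* (after(skb->seq+1, skb1->seq) is just
			   skb->seq >= skb1->seq in sequence space,
			   so the new frame belongs after skb1) */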
4214
4215 if (after(skb->seq+1, skb1->seq))
4216 {
4217 skb_append(skb1,skb);
4218 break;
4219 }
4220
4221 /*
4222 * See if we've hit the start. If so insert.
4223 */
4224 if (skb1 == skb_peek(&sk->receive_queue))
4225 {
4226 skb_queue_head(&sk->receive_queue, skb);
4227 break;
4228 }
4229 }
4230 }
4231
4232 /*
4233 * Figure out what the ack value for this frame is
4234 */
4235
4236 if (before(sk->acked_seq, sk->copied_seq))
4237 {
4238 printk("*** tcp.c:tcp_data bug acked < copied\n");
4239 sk->acked_seq = sk->copied_seq;
4240 }
4241
4242 /*
4243 * Now figure out if we can ack anything. This is very messy because we really want two
4244 * receive queues, a completed and an assembly queue. We also want only one transmit
4245 * queue.
4246 */
4247
4248 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(skb->seq, sk->acked_seq+1))
4249 {
4250 if (before(skb->seq, sk->acked_seq+1))
4251 {
4252 int newwindow;
4253
4254 if (after(skb->end_seq, sk->acked_seq))
4255 {
4256 newwindow = sk->window - (skb->end_seq - sk->acked_seq);
4257 if (newwindow < 0)
4258 newwindow = 0;
4259 sk->window = newwindow;
4260 sk->acked_seq = skb->end_seq;
4261 }
4262 skb->acked = 1;
4263
4264 /*
4265 * When we ack the fin, we do the FIN
4266 * processing.
4267 */
4268
4269 if (skb->h.th->fin)
4270 {
4271 tcp_fin(skb,sk,skb->h.th);
4272 }
4273
4274 for(skb2 = skb->next;
4275 skb2 != (struct sk_buff *)&sk->receive_queue;
4276 skb2 = skb2->next)
4277 {
4278 if (before(skb2->seq, sk->acked_seq+1))
4279 {
4280 if (after(skb2->end_seq, sk->acked_seq))
4281 {
4282 newwindow = sk->window -
4283 (skb2->end_seq - sk->acked_seq);
4284 if (newwindow < 0)
4285 newwindow = 0;
4286 sk->window = newwindow;
4287 sk->acked_seq = skb2->end_seq;
4288 }
4289 skb2->acked = 1;
4290 /*
4291 * When we ack the fin, we do
4292 * the fin handling.
4293 */
4294 if (skb2->h.th->fin)
4295 {
4296 tcp_fin(skb,sk,skb->h.th);
4297 }
4298
4299 /*
4300 * Force an immediate ack.
4301 */
4302
4303 sk->ack_backlog = sk->max_ack_backlog;
4304 }
4305 else
4306 {
4307 break;
4308 }
4309 }
4310
4311 /*
4312 * This also takes care of updating the window.
4313 * This if statement needs to be simplified.
4314 */
4315 if (!sk->delay_acks ||
4316 sk->ack_backlog >= sk->max_ack_backlog ||
4317 sk->bytes_rcv > sk->max_unacked || th->fin) {
4318 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4319 }
4320 else
4321 {
4322 sk->ack_backlog++;
4323 if(sk->debug)
4324 printk("Ack queued.\n");
4325 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4326 }
4327 }
4328 }
4329
4330 /*
4331 * If we've missed a packet, send an ack.
4332 * Also start a timer to send another.
4333 */
4334
4335 if (!skb->acked)
4336 {
4337
4338 /*
4339 * This is important. If we don't have much room left,
4340 * we need to throw out a few packets so we have a good
4341 * window. Note that mtu is used, not mss, because mss is really
4342	 *	for the send side. The other end could be sending us frames as large as mtu.
4343 */
4344
4345 while (sock_rspace(sk) < sk->mtu)
4346 {
4347 skb1 = skb_peek(&sk->receive_queue);
4348 if (skb1 == NULL)
4349 {
4350 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4351 break;
4352 }
4353
4354 /*
4355 * Don't throw out something that has been acked.
4356 */
4357
4358 if (skb1->acked)
4359 {
4360 break;
4361 }
4362
4363 skb_unlink(skb1);
4364 kfree_skb(skb1, FREE_READ);
4365 }
4366 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4367 sk->ack_backlog++;
4368 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4369 }
4370 else
4371 {
4372 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4373 }
4374
4375 /*
4376 * Now tell the user we may have some data.
4377 */
4378
4379 if (!sk->dead)
4380 {
4381 if(sk->debug)
4382 printk("Data wakeup.\n");
4383 sk->data_ready(sk,0);
4384 }
4385 return(0);
4386 }
4387
4388
4389 /*
4390 * This routine is only called when we have urgent data
4391	 *	signalled. It's the 'slow' part of tcp_urg. It could be
4392 * moved inline now as tcp_urg is only called from one
4393 * place. We handle URGent data wrong. We have to - as
4394 * BSD still doesn't use the correction from RFC961.
4395 */
4396
4397 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4398 {
4399 u32 ptr = ntohs(th->urg_ptr);
4400
4401 if (ptr)
4402 ptr--;
4403 ptr += ntohl(th->seq);
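	/*
	 *	Example (illustrative numbers): seq == 1000 with
	 *	urg_ptr == 4 leaves ptr == 1003, the last urgent byte
	 *	itself - BSD's urgent pointer counts one past it, and
	 *	the decrement above corrects for that.
	 */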
4404
4405 /* ignore urgent data that we've already seen and read */
4406 if (after(sk->copied_seq, ptr))
4407 return;
4408
4409 /* do we already have a newer (or duplicate) urgent pointer? */
4410 if (sk->urg_data && !after(ptr, sk->urg_seq))
4411 return;
4412
4413 /* tell the world about our new urgent pointer */
4414 if (sk->proc != 0) {
4415 if (sk->proc > 0) {
4416 kill_proc(sk->proc, SIGURG, 1);
4417 } else {
4418 kill_pg(-sk->proc, SIGURG, 1);
4419 }
4420 }
4421 sk->urg_data = URG_NOTYET;
4422 sk->urg_seq = ptr;
4423 }
4424
4425 /*
4426 * This is the 'fast' part of urgent handling.
4427 */
4428
4429 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4430 unsigned long saddr, unsigned long len)
4431 {
4432 u32 ptr;
4433
4434 /*
4435 * Check if we get a new urgent pointer - normally not
4436 */
4437
4438 if (th->urg)
4439 tcp_check_urg(sk,th);
4440
4441 /*
4442 * Do we wait for any urgent data? - normally not
4443 */
4444
4445 if (sk->urg_data != URG_NOTYET)
4446 return 0;
4447
4448 /*
4449 * Is the urgent pointer pointing into this packet?
4450 */
4451
4452 ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
4453 if (ptr >= len)
4454 return 0;
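	/*
	 *	Example (illustrative): with urg_seq == 1003, seq == 1000
	 *	and a 20 byte header (doff == 5), ptr == 23: the urgent
	 *	byte sits 23 octets from the start of the TCP header,
	 *	which is how it is fetched below.
	 */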
4455
4456 /*
4457 * Ok, got the correct packet, update info
4458 */
4459
4460 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4461 if (!sk->dead)
4462 sk->data_ready(sk,0);
4463 return 0;
4464 }
4465
4466 /*
4467 * This will accept the next outstanding connection.
4468 */
4469
4470 static struct sock *tcp_accept(struct sock *sk, int flags)
4471 {
4472 struct sock *newsk;
4473 struct sk_buff *skb;
4474
4475 /*
4476 * We need to make sure that this socket is listening,
4477 * and that it has something pending.
4478 */
4479
4480 if (sk->state != TCP_LISTEN)
4481 {
4482 sk->err = EINVAL;
4483 return(NULL);
4484 }
4485
4486 /* Avoid the race. */
4487 cli();
4488 sk->inuse = 1;
4489
4490 while((skb = tcp_dequeue_established(sk)) == NULL)
4491 {
4492 if (flags & O_NONBLOCK)
4493 {
4494 sti();
4495 release_sock(sk);
4496 sk->err = EAGAIN;
4497 return(NULL);
4498 }
4499
4500 release_sock(sk);
4501 interruptible_sleep_on(sk->sleep);
4502 if (current->signal & ~current->blocked)
4503 {
4504 sti();
4505 sk->err = ERESTARTSYS;
4506 return(NULL);
4507 }
4508 sk->inuse = 1;
4509 }
4510 sti();
4511
4512 /*
4513 * Now all we need to do is return skb->sk.
4514 */
4515
4516 newsk = skb->sk;
4517
4518 kfree_skb(skb, FREE_READ);
4519 sk->ack_backlog--;
4520 release_sock(sk);
4521 return(newsk);
4522 }
4523
4524
4525 /*
4526 * This will initiate an outgoing connection.
4527 */
4528
4529 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4530 {
4531 struct sk_buff *buff;
4532 struct device *dev=NULL;
4533 unsigned char *ptr;
4534 int tmp;
4535 int atype;
4536 struct tcphdr *t1;
4537 struct rtable *rt;
4538
4539 if (sk->state != TCP_CLOSE)
4540 return(-EISCONN);
4541
4542 /*
4543 * Don't allow a double connect.
4544 */
4545
4546 if(sk->daddr)
4547 return -EINVAL;
4548
4549 if (addr_len < 8)
4550 return(-EINVAL);
4551
4552 if (usin->sin_family && usin->sin_family != AF_INET)
4553 return(-EAFNOSUPPORT);
4554
4555 /*
4556 * connect() to INADDR_ANY means loopback (BSD'ism).
4557 */
4558
4559 if(usin->sin_addr.s_addr==INADDR_ANY)
4560 usin->sin_addr.s_addr=ip_my_addr();
4561
4562 /*
4563 * Don't want a TCP connection going to a broadcast address
4564 */
4565
4566 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4567 return -ENETUNREACH;
4568
4569 sk->inuse = 1;
4570 sk->daddr = usin->sin_addr.s_addr;
4571 sk->write_seq = tcp_init_seq();
4572 sk->window_seq = sk->write_seq;
4573 sk->rcv_ack_seq = sk->write_seq -1;
4574 sk->err = 0;
4575 sk->dummy_th.dest = usin->sin_port;
4576 release_sock(sk);
4577
4578 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4579 if (buff == NULL)
4580 {
4581 return(-ENOMEM);
4582 }
4583 sk->inuse = 1;
4584 buff->sk = sk;
4585 buff->free = 0;
4586 buff->localroute = sk->localroute;
4587
4588
4589 /*
4590 * Put in the IP header and routing stuff.
4591 */
4592
4593 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4594 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
4595 if (tmp < 0)
4596 {
4597 sock_wfree(sk, buff);
4598 release_sock(sk);
4599 return(-ENETUNREACH);
4600 }
4601 if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
4602 sk->saddr = rt->rt_src;
4603 sk->rcv_saddr = sk->saddr;
4604
4605 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4606
4607 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4608 buff->seq = sk->write_seq++;
4609 t1->seq = htonl(buff->seq);
4610 sk->sent_seq = sk->write_seq;
4611 buff->end_seq = sk->write_seq;
4612 t1->ack = 0;
4613 t1->window = 2;
4614 t1->res1=0;
4615 t1->res2=0;
4616 t1->rst = 0;
4617 t1->urg = 0;
4618 t1->psh = 0;
4619 t1->syn = 1;
4620 t1->urg_ptr = 0;
4621 t1->doff = 6;
4622 /* use 512 or whatever user asked for */
4623
4624 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4625 sk->window_clamp=rt->rt_window;
4626 else
4627 sk->window_clamp=0;
4628
4629 if (sk->user_mss)
4630 sk->mtu = sk->user_mss;
4631 else if (rt)
4632 sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
4633 else
4634 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4635
4636 /*
4637 * but not bigger than device MTU
4638 */
4639
4640 if(sk->mtu <32)
4641 sk->mtu = 32; /* Sanity limit */
4642
4643 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
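	/*
	 *	E.g. an ethernet device with an mtu of 1500 clamps sk->mtu
	 *	to 1460 here: 1500 less 20 bytes of IP header and 20 of TCP.
	 */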
4644
4645 #ifdef CONFIG_SKIP
4646
4647 /*
4648	 * SKIP devices set their MTU to 65535. This is so they can take packets
4649	 * unfragmented to the security process and then fragment. They could lie
4650	 * to the TCP layer about a suitable MTU, but it's easier to let SKIP sort
4651	 * it out, simply because the final packet we want unfragmented is going to be
4652 *
4653 * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
4654 */
4655
4656 if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
4657 sk->mtu=skip_pick_mtu(sk->mtu,dev);
4658 #endif
4659
4660 /*
4661 * Put in the TCP options to say MTU.
4662 */
4663
4664 ptr = skb_put(buff,4);
4665 ptr[0] = 2;
4666 ptr[1] = 4;
4667 ptr[2] = (sk->mtu) >> 8;
4668 ptr[3] = (sk->mtu) & 0xff;
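	/*
	 *	The four bytes above form a standard MSS option: kind 2,
	 *	length 4, then the 16 bit MSS in network byte order -
	 *	e.g. an mtu of 1460 is encoded as 02 04 05 b4.
	 */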
4669 tcp_send_check(t1, sk->saddr, sk->daddr,
4670 sizeof(struct tcphdr) + 4, sk);
4671
4672 /*
4673 * This must go first otherwise a really quick response will get reset.
4674 */
4675
4676 tcp_cache_zap();
4677 tcp_set_state(sk,TCP_SYN_SENT);
4678 if(rt&&rt->rt_flags&RTF_IRTT)
4679 sk->rto = rt->rt_irtt;
4680 else
4681 sk->rto = TCP_TIMEOUT_INIT;
4682 sk->retransmit_timer.function=&retransmit_timer;
4683 sk->retransmit_timer.data = (unsigned long)sk;
4684 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4685 sk->retransmits = 0; /* Now works the right way instead of a hacked
4686 initial setting */
4687
4688 sk->prot->queue_xmit(sk, dev, buff, 0);
4689 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4690 tcp_statistics.TcpActiveOpens++;
4691 tcp_statistics.TcpOutSegs++;
4692
4693 release_sock(sk);
4694 return(0);
4695 }
4696
4697
4698 /*
4699	 *	This function checks to see if the tcp header is actually acceptable.
4700 */
4701
4702 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4703 struct options *opt, unsigned long saddr, struct device *dev)
4704 {
4705 u32 next_seq;
4706
4707 next_seq = len - 4*th->doff;
4708 if (th->fin)
4709 next_seq++;
4710 /* if we have a zero window, we can't have any data in the packet.. */
4711 if (next_seq && !sk->window)
4712 goto ignore_it;
4713 next_seq += ntohl(th->seq);
4714
4715 /*
4716 * This isn't quite right. sk->acked_seq could be more recent
4717 * than sk->window. This is however close enough. We will accept
4718 * slightly more packets than we should, but it should not cause
4719 * problems unless someone is trying to forge packets.
4720 */
4721
4722 /* have we already seen all of this packet? */
4723 if (!after(next_seq+1, sk->acked_seq))
4724 goto ignore_it;
4725 /* or does it start beyond the window? */
4726 if (!before(ntohl(th->seq), sk->acked_seq + sk->window + 1))
4727 goto ignore_it;
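	/*
	 *	Example (illustrative numbers): with acked_seq == 5000 and
	 *	window == 4096 a segment is interesting only if it ends at
	 *	or beyond 5000 and starts before 9097; anything else is
	 *	ignored.
	 */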
4728
4729 /* ok, at least part of this packet would seem interesting.. */
4730 return 1;
4731
4732 ignore_it:
4733 if (th->rst)
4734 return 0;
4735
4736 /*
4737 * Send a reset if we get something not ours and we are
4738 * unsynchronized. Note: We don't do anything to our end. We
4739	 *	are just killing the bogus remote connection; then we will
4740 * connect again and it will work (with luck).
4741 */
4742
4743 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4744 {
4745 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4746 return 1;
4747 }
4748
4749 /* Try to resync things. */
4750 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4751 return 0;
4752 }
4753
4754 /*
4755 * When we get a reset we do this.
4756 */
4757
4758 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4759 {
4760 sk->zapped = 1;
4761 sk->err = ECONNRESET;
4762 if (sk->state == TCP_SYN_SENT)
4763 sk->err = ECONNREFUSED;
4764 if (sk->state == TCP_CLOSE_WAIT)
4765 sk->err = EPIPE;
4766 #ifdef TCP_DO_RFC1337
4767 /*
4768 * Time wait assassination protection [RFC1337]
4769 */
4770 if(sk->state!=TCP_TIME_WAIT)
4771 {
4772 tcp_set_state(sk,TCP_CLOSE);
4773 sk->shutdown = SHUTDOWN_MASK;
4774 }
4775 #else
4776 tcp_set_state(sk,TCP_CLOSE);
4777 sk->shutdown = SHUTDOWN_MASK;
4778 #endif
4779 if (!sk->dead)
4780 sk->state_change(sk);
4781 kfree_skb(skb, FREE_READ);
4782 release_sock(sk);
4783 return(0);
4784 }
4785
4786 /*
4787 * A TCP packet has arrived.
4788 * skb->h.raw is the TCP header.
4789 */
4790
4791 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4792 __u32 daddr, unsigned short len,
4793 __u32 saddr, int redo, struct inet_protocol * protocol)
4794 {
4795 struct tcphdr *th;
4796 struct sock *sk;
4797 int syn_ok=0;
4798
4799 tcp_statistics.TcpInSegs++;
4800 if(skb->pkt_type!=PACKET_HOST)
4801 {
4802 kfree_skb(skb,FREE_READ);
4803 return(0);
4804 }
4805
4806 th = skb->h.th;
4807
4808 /*
4809 * Find the socket, using the last hit cache if applicable.
4810 */
4811
4812 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4813 {
4814 sk=(struct sock *)th_cache_sk;
4815		/*
4816		 * We suspect this cache is causing a bug, so cross-check it against a full lookup
4817		 */
4818 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4819 printk("Cache mismatch on TCP.\n");
4820 }
4821 else
4822 {
4823 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4824 th_cache_saddr=saddr;
4825 th_cache_daddr=daddr;
4826 th_cache_dport=th->dest;
4827 th_cache_sport=th->source;
4828 th_cache_sk=sk;
4829 }
4830
4831 /*
4832 * If this socket has got a reset it's to all intents and purposes
4833 * really dead. Count closed sockets as dead.
4834 *
4835 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4836 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4837 * exist so should cause resets as if the port was unreachable.
4838 */
4839
4840 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4841 sk=NULL;
4842
4843 if (!redo)
4844 {
4845 /*
4846 * Pull up the IP header.
4847 */
4848 skb_pull(skb, skb->h.raw-skb->data);
4849 /*
4850 * Try to use the device checksum if provided.
4851 */
4852 if (
4853 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4854 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4855 )
4856 {
4857 skb->sk = NULL;
4858 kfree_skb(skb,FREE_READ);
4859 /*
4860 * We don't release the socket because it was
4861 * never marked in use.
4862 */
4863 return(0);
4864 }
4865
4866 skb->seq = ntohl(th->seq);
4867 skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
4868 skb->ack_seq = ntohl(th->ack_seq);
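		/*
		 *	Note that end_seq counts SYN and FIN as one sequence
		 *	unit each, e.g. a bare SYN carrying no data has
		 *	end_seq == seq + 1.
		 */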
4869
4870 /* See if we know about the socket. */
4871 if (sk == NULL)
4872 {
4873 /*
4874 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4875 */
4876 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4877 skb->sk = NULL;
4878 /*
4879 * Discard frame
4880 */
4881 kfree_skb(skb, FREE_READ);
4882 return(0);
4883 }
4884
4885 skb->acked = 0;
4886 skb->used = 0;
4887 skb->free = 0;
4888 skb->saddr = daddr;
4889 skb->daddr = saddr;
4890
4891 /* We may need to add it to the backlog here. */
4892 cli();
4893 if (sk->inuse)
4894 {
4895 skb_queue_tail(&sk->back_log, skb);
4896 sti();
4897 return(0);
4898 }
4899 sk->inuse = 1;
4900 sti();
4901 }
4902 else
4903 {
4904 if (sk==NULL)
4905 {
4906 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4907 skb->sk = NULL;
4908 kfree_skb(skb, FREE_READ);
4909 return(0);
4910 }
4911 }
4912
4913
4914 if (!sk->prot)
4915 {
4916 printk("IMPOSSIBLE 3\n");
4917 return(0);
4918 }
4919
4920
4921 /*
4922 * Charge the memory to the socket.
4923 */
4924
4925 skb->sk=sk;
4926 sk->rmem_alloc += skb->truesize;
4927
4928 /*
4929 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4930 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4931 * compatibility. We also set up variables more thoroughly [Karn notes in the
4932 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4933 */
4934
4935 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4936 {
4937
4938 /*
4939 * Now deal with unusual cases.
4940 */
4941
4942 if(sk->state==TCP_LISTEN)
4943 {
4944 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4945 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4946
4947 /*
4948		 * We don't care for RST, and non-SYN segments are absorbed (old segments).
4949		 * Broadcast/multicast SYN isn't allowed. Note - there is a bug: if you
4950		 * change the netmask on a running connection it can go broadcast. Even
4951		 * Suns have this problem so I'm ignoring it.
4952 */
4953
4954 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4955 {
4956 kfree_skb(skb, FREE_READ);
4957 release_sock(sk);
4958 return 0;
4959 }
4960
4961 /*
4962 * Guess we need to make a new socket up
4963 */
4964
4965 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4966
4967 /*
4968 * Now we have several options: In theory there is nothing else
4969 * in the frame. KA9Q has an option to send data with the syn,
4970 * BSD accepts data with the syn up to the [to be] advertised window
4971 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4972 * it, that fits the spec precisely and avoids incompatibilities. It
4973 * would be nice in future to drop through and process the data.
4974 */
4975
4976 release_sock(sk);
4977 return 0;
4978 }
4979
4980 /* retransmitted SYN? */
4981 if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
4982 {
4983 kfree_skb(skb, FREE_READ);
4984 release_sock(sk);
4985 return 0;
4986 }
4987
4988 /*
4989 * SYN sent means we have to look for a suitable ack and either reset
4990 * for bad matches or go to connected
4991 */
4992
4993 if(sk->state==TCP_SYN_SENT)
4994 {
4995 /* Crossed SYN or previous junk segment */
4996 if(th->ack)
4997 {
4998 /* We got an ack, but it's not a good ack */
4999 if(!tcp_ack(sk,th,saddr,len))
5000 {
5001				/* Reset the ack - it's an ack from a
5002 different connection [ th->rst is checked in tcp_reset()] */
5003 tcp_statistics.TcpAttemptFails++;
5004 tcp_reset(daddr, saddr, th,
5005 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
5006 kfree_skb(skb, FREE_READ);
5007 release_sock(sk);
5008 return(0);
5009 }
5010 if(th->rst)
5011 return tcp_std_reset(sk,skb);
5012 if(!th->syn)
5013 {
5014 /* A valid ack from a different connection
5015 start. Shouldn't happen but cover it */
5016 kfree_skb(skb, FREE_READ);
5017 release_sock(sk);
5018 return 0;
5019 }
5020 /*
5021 * Ok.. it's good. Set up sequence numbers and
5022 * move to established.
5023 */
5024 syn_ok=1; /* Don't reset this connection for the syn */
5025 sk->acked_seq = skb->seq+1;
5026 sk->fin_seq = skb->seq;
5027 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
5028 tcp_set_state(sk, TCP_ESTABLISHED);
5029 tcp_options(sk,th);
5030 sk->dummy_th.dest=th->source;
5031 sk->copied_seq = sk->acked_seq;
5032 if(!sk->dead)
5033 {
5034 sk->state_change(sk);
5035 sock_wake_async(sk->socket, 0);
5036 }
5037 if(sk->max_window==0)
5038 {
5039 sk->max_window = 32;
5040 sk->mss = min(sk->max_window, sk->mtu);
5041 }
5042 }
5043 else
5044 {
5045 /* See if SYN's cross. Drop if boring */
5046 if(th->syn && !th->rst)
5047 {
5048 /* Crossed SYN's are fine - but talking to
5049 yourself is right out... */
5050 if(sk->saddr==saddr && sk->daddr==daddr &&
5051 sk->dummy_th.source==th->source &&
5052 sk->dummy_th.dest==th->dest)
5053 {
5054 tcp_statistics.TcpAttemptFails++;
5055 return tcp_std_reset(sk,skb);
5056 }
5057 tcp_set_state(sk,TCP_SYN_RECV);
5058
5059 /*
5060 * FIXME:
5061 * Must send SYN|ACK here
5062 */
5063 }
5064 /* Discard junk segment */
5065 kfree_skb(skb, FREE_READ);
5066 release_sock(sk);
5067 return 0;
5068 }
5069 /*
5070 * SYN_RECV with data maybe.. drop through
5071 */
5072 goto rfc_step6;
5073 }
5074
5075 /*
5076 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5077 * a more complex suggestion for fixing these reuse issues in RFC1644
5078 * but not yet ready for general use. Also see RFC1379.
5079 */
5080
5081 #define BSD_TIME_WAIT
5082 #ifdef BSD_TIME_WAIT
5083 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5084 after(skb->seq, sk->acked_seq) && !th->rst)
5085 {
5086 u32 seq = sk->write_seq;
5087 if(sk->debug)
5088 printk("Doing a BSD time wait\n");
5089 tcp_statistics.TcpEstabResets++;
5090 sk->rmem_alloc -= skb->truesize;
5091 skb->sk = NULL;
5092 sk->err=ECONNRESET;
5093 tcp_set_state(sk, TCP_CLOSE);
5094 sk->shutdown = SHUTDOWN_MASK;
5095 release_sock(sk);
5096 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5097 if (sk && sk->state==TCP_LISTEN)
5098 {
5099 sk->inuse=1;
5100 skb->sk = sk;
5101 sk->rmem_alloc += skb->truesize;
5102 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5103 release_sock(sk);
5104 return 0;
5105 }
5106 kfree_skb(skb, FREE_READ);
5107 return 0;
5108 }
5109 #endif
5110 }
5111
5112 /*
5113	 *	We are now in normal data flow (see the step list in the RFC).
5114	 *	Note most of these are inline now. I'll inline the lot when
5115	 *	I have time to test it hard and look at what gcc outputs.
5116 */
5117
5118 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5119 {
5120 kfree_skb(skb, FREE_READ);
5121 release_sock(sk);
5122 return 0;
5123 }
5124
5125 if(th->rst)
5126 return tcp_std_reset(sk,skb);
5127
5128 /*
5129 * !syn_ok is effectively the state test in RFC793.
5130 */
5131
5132 if(th->syn && !syn_ok)
5133 {
5134 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5135 return tcp_std_reset(sk,skb);
5136 }
5137
5138 /*
5139 * Process the ACK
5140 */
5141
5142
5143 if(th->ack && !tcp_ack(sk,th,saddr,len))
5144 {
5145 /*
5146 * Our three way handshake failed.
5147 */
5148
5149 if(sk->state==TCP_SYN_RECV)
5150 {
5151 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5152 }
5153 kfree_skb(skb, FREE_READ);
5154 release_sock(sk);
5155 return 0;
5156 }
5157
5158 rfc_step6: /* I'll clean this up later */
5159
5160 /*
5161	 *	If the accepted buffer puts us over our queue size we
5162 * now drop it (we must process the ack first to avoid
5163 * deadlock cases).
5164 */
5165
5166 if (sk->rmem_alloc >= sk->rcvbuf)
5167 {
5168 kfree_skb(skb, FREE_READ);
5169 release_sock(sk);
5170 return(0);
5171 }
5172
5173
5174 /*
5175 * Process urgent data
5176 */
5177
5178 if(tcp_urg(sk, th, saddr, len))
5179 {
5180 kfree_skb(skb, FREE_READ);
5181 release_sock(sk);
5182 return 0;
5183 }
5184
5185 /*
5186 * Process the encapsulated data
5187 */
5188
5189 if(tcp_data(skb,sk, saddr, len))
5190 {
5191 kfree_skb(skb, FREE_READ);
5192 release_sock(sk);
5193 return 0;
5194 }
5195
5196 /*
5197 * And done
5198 */
5199
5200 release_sock(sk);
5201 return 0;
5202 }
5203
5204 /*
5205 * This routine sends a packet with an out of date sequence
5206 * number. It assumes the other end will try to ack it.
5207 */
5208
5209 static void tcp_write_wakeup(struct sock *sk)
5210 {
5211 struct sk_buff *buff,*skb;
5212 struct tcphdr *t1;
5213 struct device *dev=NULL;
5214 int tmp;
5215
5216 if (sk->zapped)
5217 return; /* After a valid reset we can send no more */
5218
5219 /*
5220 * Write data can still be transmitted/retransmitted in the
5221 * following states. If any other state is encountered, return.
5222 * [listen/close will never occur here anyway]
5223 */
5224
5225 if (sk->state != TCP_ESTABLISHED &&
5226 sk->state != TCP_CLOSE_WAIT &&
5227 sk->state != TCP_FIN_WAIT1 &&
5228 sk->state != TCP_LAST_ACK &&
5229 sk->state != TCP_CLOSING
5230 )
5231 {
5232 return;
5233 }
5234 if ( before(sk->sent_seq, sk->window_seq) &&
5235 (skb=skb_peek(&sk->write_queue)))
5236 {
5237 /*
5238		 *	We are probing the opening of a window
5239		 *	but the window size is != 0 - this must have been
5240		 *	the result of SWS avoidance (sender side).
5241 */
5242
5243 struct iphdr *iph;
5244 struct tcphdr *th;
5245 struct tcphdr *nth;
5246 unsigned long win_size;
5247 #if 0
5248 unsigned long ow_size;
5249 #endif
5250 void * tcp_data_start;
5251
5252 /*
5253 * How many bytes can we send ?
5254 */
5255
5256 win_size = sk->window_seq - sk->sent_seq;
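		/*
		 *	E.g. with sent_seq == 7000 and window_seq == 7100 we
		 *	may send the next 100 bytes of the segment at the
		 *	head of the queue.
		 */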
5257
5258 /*
5259 * Recover the buffer pointers
5260 */
5261
5262 iph = (struct iphdr *)skb->ip_hdr;
5263 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5264
5265 /*
5266 * Grab the data for a temporary frame
5267 */
5268
5269 buff = sock_wmalloc(sk, win_size + th->doff * 4 +
5270 (iph->ihl << 2) +
5271 sk->prot->max_header + 15,
5272 1, GFP_ATOMIC);
5273 if ( buff == NULL )
5274 return;
5275
5276 /*
5277 * If we strip the packet on the write queue we must
5278 * be ready to retransmit this one
5279 */
5280
5281 buff->free = /*0*/1;
5282
5283 buff->sk = sk;
5284 buff->localroute = sk->localroute;
5285
5286 /*
5287 * Put headers on the new packet
5288 */
5289
5290 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5291 IPPROTO_TCP, sk->opt, buff->truesize,
5292 sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5293 if (tmp < 0)
5294 {
5295 sock_wfree(sk, buff);
5296 return;
5297 }
5298
5299 /*
5300 * Move the TCP header over
5301 */
5302
5303 buff->dev = dev;
5304
5305 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5306
5307 memcpy(nth, th, th->doff * 4);
5308
5309 /*
5310 * Correct the new header
5311 */
5312
5313 nth->ack = 1;
5314 nth->ack_seq = htonl(sk->acked_seq);
5315 nth->window = htons(tcp_select_window(sk));
5316 nth->check = 0;
5317
5318 /*
5319 * Find the first data byte.
5320 */
5321
5322 tcp_data_start = skb->data + skb->dev->hard_header_len +
5323 (iph->ihl << 2) + th->doff * 4;
5324
5325 /*
5326 * Add it to our new buffer
5327 */
5328 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5329
5330 /*
5331 * Remember our right edge sequence number.
5332 */
5333
5334 buff->end_seq = sk->sent_seq + win_size;
5335 sk->sent_seq = buff->end_seq; /* Hack */
5336 #if 0
5337
5338 /*
5339 * now: shrink the queue head segment
5340 */
5341
5342 th->check = 0;
5343 ow_size = skb->len - win_size -
5344 ((unsigned long) (tcp_data_start - (void *) skb->data));
5345
5346 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5347 skb_trim(skb,skb->len-win_size);
5348 sk->sent_seq += win_size;
5349 th->seq = htonl(sk->sent_seq);
5350 if (th->urg)
5351 {
5352 unsigned short urg_ptr;
5353
5354 urg_ptr = ntohs(th->urg_ptr);
5355 if (urg_ptr <= win_size)
5356 th->urg = 0;
5357 else
5358 {
5359 urg_ptr -= win_size;
5360 th->urg_ptr = htons(urg_ptr);
5361 nth->urg_ptr = htons(win_size);
5362 }
5363 }
5364 #else
5365 if(th->urg && ntohs(th->urg_ptr) < win_size)
5366 nth->urg = 0;
5367 #endif
5368
5369 /*
5370 * Checksum the split buffer
5371 */
5372
5373 tcp_send_check(nth, sk->saddr, sk->daddr,
5374 nth->doff * 4 + win_size , sk);
5375 }
5376 else
5377 {
5378 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5379 if (buff == NULL)
5380 return;
5381
5382 buff->free = 1;
5383 buff->sk = sk;
5384 buff->localroute = sk->localroute;
5385
5386 /*
5387 * Put in the IP header and routing stuff.
5388 */
5389
5390 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5391 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
5392 if (tmp < 0)
5393 {
5394 sock_wfree(sk, buff);
5395 return;
5396 }
5397
5398 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5399 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5400
5401 /*
5402 * Use a previous sequence.
5403 * This should cause the other end to send an ack.
5404 */
5405
5406 t1->seq = htonl(sk->sent_seq-1);
5407 t1->ack = 1;
5408 t1->res1= 0;
5409 t1->res2= 0;
5410 t1->rst = 0;
5411 t1->urg = 0;
5412 t1->psh = 0;
5413 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5414 t1->syn = 0;
5415 t1->ack_seq = htonl(sk->acked_seq);
5416 t1->window = htons(tcp_select_window(sk));
5417 t1->doff = sizeof(*t1)/4;
5418 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5419
5420 }
5421
5422 /*
5423 * Send it.
5424 */
5425
5426 sk->prot->queue_xmit(sk, dev, buff, 1);
5427 tcp_statistics.TcpOutSegs++;
5428 }
5429
5430 /*
5431 * A window probe timeout has occurred.
5432 */
5433
5434 void tcp_send_probe0(struct sock *sk)
5435 {
5436 if (sk->zapped)
5437 return; /* After a valid reset we can send no more */
5438
5439 tcp_write_wakeup(sk);
5440
5441 sk->backoff++;
5442 sk->rto = min(sk->rto << 1, 120*HZ);
5443 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
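	/*
	 *	Example (illustrative): with an initial rto of 3*HZ the
	 *	probes go out roughly 3, 6, 12, 24, ... seconds apart,
	 *	until the doubling is clamped at the 120 second ceiling
	 *	above.
	 */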
5444 sk->retransmits++;
5445 sk->prot->retransmits ++;
5446 }
5447
5448 /*
5449 * Socket option code for TCP.
5450 */
5451
5452 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5453 {
5454 int val,err;
5455
5456 if(level!=SOL_TCP)
5457 return ip_setsockopt(sk,level,optname,optval,optlen);
5458
5459 if (optval == NULL)
5460 return(-EINVAL);
5461
5462 err=verify_area(VERIFY_READ, optval, sizeof(int));
5463 if(err)
5464 return err;
5465
5466 val = get_user((int *)optval);
5467
5468 switch(optname)
5469 {
5470 case TCP_MAXSEG:
5471 /*
5472			 *	Values greater than the interface MTU won't take effect. However, at
5473			 *	the point when this call is made we typically don't yet know
5474			 *	which interface is going to be used.
5475 */
5476 if(val<1||val>MAX_WINDOW)
5477 return -EINVAL;
5478 sk->user_mss=val;
5479 return 0;
5480 case TCP_NODELAY:
5481 sk->nonagle=(val==0)?0:1;
5482 return 0;
5483 default:
5484 return(-ENOPROTOOPT);
5485 }
5486 }
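/*
 *	A minimal user level sketch (illustrative only; assumes the usual
 *	<netinet/tcp.h> definitions and a hypothetical connected socket fd):
 *
 *		int one = 1;
 *		setsockopt(fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
 *
 *		int mss = 512;
 *		setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
 *
 *	TCP_MAXSEG lands in sk->user_mss above and is only applied when
 *	the connection is set up, so it must be set before connecting.
 */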
5487
5488 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5489 {
5490 int val,err;
5491
5492 if(level!=SOL_TCP)
5493 return ip_getsockopt(sk,level,optname,optval,optlen);
5494
5495 switch(optname)
5496 {
5497 case TCP_MAXSEG:
5498 val=sk->user_mss;
5499 break;
5500 case TCP_NODELAY:
5501 val=sk->nonagle;
5502 break;
5503 default:
5504 return(-ENOPROTOOPT);
5505 }
5506 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5507 if(err)
5508 return err;
5509 put_user(sizeof(int),(int *) optlen);
5510
5511 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5512 if(err)
5513 return err;
5514 put_user(val,(int *)optval);
5515
5516 return(0);
5517 }
5518
5519
5520 struct proto tcp_prot = {
5521 tcp_close,
5522 ip_build_header,
5523 tcp_connect,
5524 tcp_accept,
5525 ip_queue_xmit,
5526 tcp_retransmit,
5527 tcp_write_wakeup,
5528 tcp_read_wakeup,
5529 tcp_rcv,
5530 tcp_select,
5531 tcp_ioctl,
5532 NULL,
5533 tcp_shutdown,
5534 tcp_setsockopt,
5535 tcp_getsockopt,
5536 tcp_sendmsg,
5537 tcp_recvmsg,
5538 NULL, /* No special bind() */
5539 128,
5540 0,
5541 "TCP",
5542 0, 0,
5543 {NULL,}
5544 };