/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *	Alan Cox	:	Numerous verify_area() calls
 *	Alan Cox	:	Set the ACK bit on a reset
 *	Alan Cox	:	Stopped it crashing if it closed while
 *				sk->inuse=1 and was trying to connect
 *				(tcp_err()).
 *	Alan Cox	:	All icmp error handling was broken;
 *				pointers passed were wrong and the
 *				socket was looked up backwards. Nobody
 *				tested any icmp error code obviously.
 *	Alan Cox	:	tcp_err() now handled properly. It
 *				wakes people on errors. select
 *				behaves and the icmp error race
 *				has gone by moving it into sock.c
 *	Alan Cox	:	tcp_reset() fixed to work for
 *				everything not just packets for
 *				unknown sockets.
 *	Alan Cox	:	tcp option processing.
 *	Alan Cox	:	Reset tweaked (still not 100%) [Had
 *				syn rule wrong]
 *	Herp Rosmanith	:	More reset fixes
 *	Alan Cox	:	No longer acks invalid rst frames.
 *				Acking any kind of RST is right out.
 *	Alan Cox	:	Sets an ignore me flag on an rst
 *				receive otherwise odd bits of prattle
 *				escape still
 *	Alan Cox	:	Fixed another acking RST frame bug.
 *				Should stop LAN workplace lockups.
 *	Alan Cox	:	Some tidyups using the new skb list
 *				facilities
 *	Alan Cox	:	sk->keepopen now seems to work
 *	Alan Cox	:	Pulls options out correctly on accepts
 *	Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *	Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *				bit to skb ops.
 *	Alan Cox	:	Tidied tcp_data to avoid a potential
 *				nasty.
 *	Alan Cox	:	Added some better commenting, as the
 *				tcp is hard to follow
 *	Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly:	ack < copied bug fix.
 *	Johannes Stille	:	Misc tcp fixes (not all in yet).
 *	Alan Cox	:	FIN with no memory -> CRASH
 *	Alan Cox	:	Added socket option proto entries.
 *				Also added awareness of them to accept.
 *	Alan Cox	:	Added TCP options (SOL_TCP)
 *	Alan Cox	:	Switched wakeup calls to callbacks,
 *				so the kernel can layer network
 *				sockets.
 *	Alan Cox	:	Use ip_tos/ip_ttl settings.
 *	Alan Cox	:	Handle FIN (more) properly (we hope).
 *	Alan Cox	:	RST frames sent on unsynchronised
 *				state ack error.
 *	Alan Cox	:	Put in missing check for SYN bit.
 *	Alan Cox	:	Added tcp_select_window() aka NET2E
 *				window non shrink trick.
 *	Alan Cox	:	Added a couple of small NET2E timer
 *				fixes
 *	Charles Hedrick	:	TCP fixes
 *	Toomas Tamm	:	TCP window fixes
 *	Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *	Charles Hedrick	:	Rewrote most of it to actually work
 *	Linus		:	Rewrote tcp_read() and URG handling
 *				completely
 *	Gerhard Koerting:	Fixed some missing timer handling
 *	Matthew Dillon	:	Reworked TCP machine states as per RFC
 *	Gerhard Koerting:	PC/TCP workarounds
 *	Adam Caldwell	:	Assorted timer/timing errors
 *	Matthew Dillon	:	Fixed another RST bug
 *	Alan Cox	:	Move to kernel side addressing changes.
 *	Alan Cox	:	Beginning work on TCP fastpathing
 *				(not yet usable)
 *	Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *	Alan Cox	:	TCP fast path debugging
 *	Alan Cox	:	Window clamping
 *	Michael Riepe	:	Bug in tcp_check()
 *	Matt Dillon	:	More TCP improvements and RST bug fixes
 *	Matt Dillon	:	Yet more small nasties removed from the
 *				TCP code (Be very nice to this man if
 *				tcp finally works 100%) 8)
 *	Alan Cox	:	BSD accept semantics.
 *	Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver:	ENOTCONN check missing in tcp_sendto().
 *	Michael Pall	:	Handle select() after URG properly in
 *				all cases.
 *	Michael Pall	:	Undo the last fix in tcp_read_urg()
 *				(multi URG PUSH broke rlogin).
 *	Michael Pall	:	Fix the multi URG PUSH problem in
 *				tcp_readable(), select() after URG
 *				works now.
 *	Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *				BSD api.
 *	Alan Cox	:	Changed the semantics of sk->socket to
 *				fix a race and a signal problem with
 *				accept() and async I/O.
 *	Alan Cox	:	Relaxed the rules on tcp_sendto().
 *	Yury Shevchuk	:	Really fixed accept() blocking problem.
 *	Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *				clients/servers which listen in on
 *				fixed ports.
 *	Alan Cox	:	Cleaned the above up and shrank it to
 *				a sensible code size.
 *	Alan Cox	:	Self connect lockup fix.
 *	Alan Cox	:	No connect to multicast.
 *	Ross Biro	:	Close unaccepted children on master
 *				socket close.
 *	Alan Cox	:	Reset tracing code.
 *	Alan Cox	:	Spurious resets on shutdown.
 *	Alan Cox	:	Giant 15 minute/60 second timer error
 *	Alan Cox	:	Small whoops in selecting before an
 *				accept.
 *	Alan Cox	:	Kept the state trace facility since
 *				it's handy for debugging.
 *	Alan Cox	:	More reset handler fixes.
 *	Alan Cox	:	Started rewriting the code based on
 *				the RFC's; for other useful protocol
 *				references see: Comer, KA9Q NOS, and
 *				for a reference on the difference
 *				between specifications and how BSD
 *				works see the 4.4lite source.
 *	A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *				close.
 *	Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *	Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *	Alan Cox	:	Reimplemented timers as per the RFC
 *				and using multiple timers for sanity.
 *	Alan Cox	:	Small bug fixes, and a lot of new
 *				comments.
 *	Alan Cox	:	Fixed dual reader crash by locking
 *				the buffers (much like datagram.c)
 *	Alan Cox	:	Fixed stuck sockets in probe. A probe
 *				now gets fed up of retrying without
 *				(even a no space) answer.
 *	Alan Cox	:	Extracted closing code better
 *	Alan Cox	:	Fixed the closing state machine to
 *				resemble the RFC.
 *	Alan Cox	:	More 'per spec' fixes.
 *	Jorge Cwik	:	Even faster checksumming.
 *	Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *				only frames. At least one pc tcp stack
 *				generates them.
 *	Alan Cox	:	Cache last socket.
 *	Alan Cox	:	Per route irtt.
 *	Matt Day	:	Select() match BSD precisely on error
 *	Alan Cox	:	New buffers
 *	Marc Tamsky	:	Various sk->prot->retransmits and
 *				sk->retransmits misupdating fixed.
 *				Fixed tcp_write_timeout: stuck close,
 *				and TCP syn retries gets used now.
 *	Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *				ack if state is TCP_CLOSED.
 *	Alan Cox	:	Look up device on a retransmit - routes may
 *				change. Doesn't yet cope with MSS shrink right
 *				but it's a start!
 *	Marc Tamsky	:	Closing in closing fixes.
 *	Mike Shaver	:	RFC1122 verifications.
 *	Alan Cox	:	rcv_saddr errors.
 *	Alan Cox	:	Block double connect()
 *
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Implement RFC 1191 [Path MTU discovery]
 *		Look at the effect of implementing RFC 1337 suggestions and their impact.
 *		Rewrite output state machine to use a single queue and do low window
 *		situations as per the spec (RFC 1122)
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *		Cope with MTU/device switches when retransmitting in tcp.
 *		Fix the window handling to use PR's new code.
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack and
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING. Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown. There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 * RFC1122 status:
 * NOTE: I'm not going to be doing comments in the code for this one except
 * for violations and the like. tcp.c is just too big... If I say something
 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 * with Alan. -- MS 950903
 *
 *   Use of PSH (4.2.2.2)
 *     MAY aggregate data sent without the PSH flag. (does)
 *     MAY queue data received without the PSH flag. (does)
 *     SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *     MAY implement PSH on send calls. (doesn't, thus:)
 *       MUST NOT buffer data indefinitely (doesn't [1 second])
 *       MUST set PSH on last segment (does)
 *     MAY pass received PSH to application layer (doesn't)
 *     SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 *   Window Size (4.2.2.3, 4.2.2.16)
 *     MUST treat window size as an unsigned number (does)
 *     SHOULD treat window size as a 32-bit number (does not)
 *     MUST NOT shrink window once it is offered (does not normally)
 *
 *   Urgent Pointer (4.2.2.4)
 *   **MUST point urgent pointer to last byte of urgent data (not right
 *       after). (doesn't, to be like BSD)
 *     MUST inform application layer asynchronously of incoming urgent
 *       data. (does)
 *     MUST provide application with means of determining the amount of
 *       urgent data pending. (does)
 *   **MUST support urgent data sequence of arbitrary length. (doesn't, but
 *       it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *       [Follows BSD 1 byte of urgent data]
 *
 *   TCP Options (4.2.2.5)
 *     MUST be able to receive TCP options in any segment. (does)
 *     MUST ignore unsupported options (does)
 *
 *   Maximum Segment Size Option (4.2.2.6)
 *     MUST implement both sending and receiving MSS. (does)
 *     SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *       it always). (does, even when MSS == 536, which is legal)
 *     MUST assume MSS == 536 if no MSS received at connection setup (does)
 *     MUST calculate "effective send MSS" correctly:
 *       min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *       (does - but allows operator override)
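 *       (A worked example of that formula, with illustrative numbers:
 *       over Ethernet with a physical MTU of 1500, a remote MSS of 1460
 *       and no IP options, min(1500, 1460+20) - 20 - 0 = 1460 data
 *       bytes per segment.)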
 *
 *   TCP Checksum (4.2.2.7)
 *     MUST generate and check TCP checksum. (does)
 *
 *   Initial Sequence Number Selection (4.2.2.8)
 *     MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *       OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *       necessary for 10Mbps networks - and harder than BSD to spoof!)
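 *       (For scale: at 1MHz the 32-bit sequence space wraps in
 *       2^32/10^6 seconds, i.e. roughly 71 minutes - comfortably
 *       longer than any segment lifetime.)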
 *
 *   Simultaneous Open Attempts (4.2.2.10)
 *     MUST support simultaneous open attempts (does)
 *
 *   Recovery from Old Duplicate SYN (4.2.2.11)
 *     MUST keep track of active vs. passive open (does)
 *
 *   RST segment (4.2.2.12)
 *     SHOULD allow an RST segment to contain data (does, but doesn't do
 *       anything with it, which is standard)
 *
 *   Closing a Connection (4.2.2.13)
 *     MUST inform application of whether connection was closed by RST or
 *       normal close. (does)
 *     MAY allow "half-duplex" close (treat connection as closed for the
 *       local app, even before handshake is done). (does)
 *     MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 *   Retransmission Timeout (4.2.2.15)
 *     MUST implement Jacobson's slow start and congestion avoidance
 *       stuff. (does)
 *
 *   Probing Zero Windows (4.2.2.17)
 *     MUST support probing of zero windows. (does)
 *     MAY keep offered window closed indefinitely. (does)
 *     MUST allow remote window to stay closed indefinitely. (does)
 *
 *   Passive Open Calls (4.2.2.18)
 *     MUST NOT let new passive open affect other connections. (doesn't)
 *     MUST support passive opens (LISTENs) concurrently. (does)
 *
 *   Time to Live (4.2.2.19)
 *     MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 *   Event Processing (4.2.2.20)
 *     SHOULD queue out-of-order segments. (does)
 *     MUST aggregate ACK segments whenever possible. (does but badly)
 *
 *   Retransmission Timeout Calculation (4.2.3.1)
 *     MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *       calculation. (does, or at least explains them in the comments 8*b)
 *     SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 *   When to Send an ACK Segment (4.2.3.2)
 *     SHOULD implement delayed ACK. (does not)
 *     MUST keep ACK delay < 0.5 sec. (N/A)
 *
 *   When to Send a Window Update (4.2.3.3)
 *     MUST implement receiver-side SWS. (does)
 *
 *   When to Send Data (4.2.3.4)
 *     MUST implement sender-side SWS. (does - imperfectly)
 *     SHOULD implement Nagle algorithm. (does)
 *
 *   TCP Connection Failures (4.2.3.5)
 *     MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *     SHOULD inform application layer of soft errors. (doesn't)
 *
 *   TCP Keep-Alives (4.2.3.6)
 *     MAY provide keep-alives. (does)
 *     MUST make keep-alives configurable on a per-connection basis. (does)
 *     MUST default to no keep-alives. (does)
 *   **MUST make keep-alive interval configurable. (doesn't)
 *   **MUST make default keep-alive interval > 2 hours. (doesn't)
 *     MUST NOT interpret failure to ACK keep-alive packet as dead
 *       connection. (doesn't)
 *     SHOULD send keep-alive with no data. (does)
 *
 *   TCP Multihoming (4.2.3.7)
 *     MUST get source address from IP layer before sending first
 *       SYN. (does)
 *     MUST use same local address for all segments of a connection. (does)
 *
 *   IP Options (4.2.3.8)
 *     (I don't think the IP layer sees the IP options, yet.)
 *     MUST ignore unsupported IP options. (does, I guess 8*b)
 *     MAY support Time Stamp and Record Route. (doesn't)
 *   **MUST allow application to specify a source route. (doesn't?)
 *   **MUST allow received Source Route option to set route for all future
 *       segments on this connection. (doesn't, not that I think it's a
 *       huge problem)
 *
 *   ICMP messages (4.2.3.9)
 *     MUST act on ICMP errors. (does)
 *     MUST slow transmission upon receipt of a Source Quench. (does)
 *     MUST NOT abort connection upon receipt of soft Destination
 *       Unreachables (0, 1, 5), Time Exceededs and Parameter
 *       Problems. (doesn't)
 *     SHOULD report soft Destination Unreachables etc. to the
 *       application. (doesn't)
 *     SHOULD abort connection upon receipt of hard Destination Unreachable
 *       messages (2, 3, 4). (does)
 *
 *   Remote Address Validation (4.2.3.10)
 *     MUST reject as an error OPEN for invalid remote IP address. (does)
 *     MUST ignore SYN with invalid source address. (does)
 *     MUST silently discard incoming SYN for broadcast/multicast
 *       address. (does)
 *
 *   Asynchronous Reports (4.2.4.1)
 *   **MUST provide mechanism for reporting soft errors to application
 *       layer. (doesn't)
 *
 *   Type of Service (4.2.4.2)
 *     MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 * (Whew. -- MS 950903)
 **/

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/termios.h>
#include <linux/in.h>
#include <linux/fcntl.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/segment.h>
#include <net/checksum.h>

/*
 *	The MSL timer is the 'normal' timer.
 */

#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)

#define SEQ_TICK 3
unsigned long seq_offset;
struct tcp_mib tcp_statistics;

/*
 *	Cached last hit socket
 */

volatile unsigned long th_cache_saddr, th_cache_daddr;
volatile unsigned short th_cache_dport, th_cache_sport;
volatile struct sock *th_cache_sk;

void tcp_cache_zap(void)
{
	unsigned long flags;
	save_flags(flags);
	cli();
	th_cache_saddr=0;
	th_cache_daddr=0;
	th_cache_dport=0;
	th_cache_sport=0;
	th_cache_sk=NULL;
	restore_flags(flags);
}

static void tcp_close(struct sock *sk, int timeout);


/*
 *	The less said about this the better, but it works and will do for 1.2
 */

static struct wait_queue *master_select_wakeup;

static __inline__ int min(unsigned int a, unsigned int b)
{
	if (a < b)
		return(a);
	return(b);
}

#undef STATE_TRACE

#ifdef STATE_TRACE
static char *statename[]={
	"Unused","Established","Syn Sent","Syn Recv",
	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
	"Close Wait","Last ACK","Listen","Closing"
};
#endif

static __inline__ void tcp_set_state(struct sock *sk, int state)
{
	if(sk->state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab--;
#ifdef STATE_TRACE
	if(sk->debug)
		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
#endif
	/* This is a hack but it doesn't occur often and it's going to
	   be a real pain to fix nicely */

	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
	{
		wake_up_interruptible(&master_select_wakeup);
	}
	sk->state=state;
	if(state==TCP_ESTABLISHED)
		tcp_statistics.TcpCurrEstab++;
	if(sk->state==TCP_CLOSE)
		tcp_cache_zap();
}

/*
 *	This routine picks a TCP window for a socket based on
 *	the following constraints:
 *
 *	1. The window can never be shrunk once it is offered (RFC 793)
 *	2. We limit memory per socket
 *
 *	For now we use NET2E3's heuristic of offering half the memory
 *	we have handy. All is not as bad as this seems however because
 *	of two things. Firstly we will bin packets even within the window
 *	in order to get the data we are waiting for into the memory limit.
 *	Secondly we bin common duplicate forms at receive time.
 *	Better heuristics welcome.
 */

int tcp_select_window(struct sock *sk)
{
	int new_window = sock_rspace(sk);

	if(sk->window_clamp)
		new_window=min(sk->window_clamp,new_window);
	/*
	 *	Two things are going on here. First, we don't ever offer a
	 *	window less than min(sk->mss, MAX_WINDOW/2). This is the
	 *	receiver side of SWS as specified in RFC1122.
	 *	Second, we always give them at least the window they
	 *	had before, in order to avoid retracting window. This
	 *	is technically allowed, but RFC1122 advises against it and
	 *	in practice it causes trouble.
	 *
	 *	Fixme: This doesn't correctly handle the case where
	 *	new_window > sk->window but not by enough to allow for the
	 *	shift in sequence space.
	 */
	if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window)
		return(sk->window);
	return(new_window);
}

/*
 *	Find someone to 'accept'. Must be called with
 *	sk->inuse=1 or cli()
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		skb->sk->dead=1;
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 *	Enter the time wait state.
 */

static void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}

/*
 *	A socket has timed out on its send queue and wants to do a
 *	little retransmitting. Currently this means TCP.
 */

void tcp_do_retransmit(struct sock *sk, int all)
{
	struct sk_buff * skb;
	struct proto *prot;
	struct device *dev;
	int ct=0;
	struct rtable *rt;

	prot = sk->prot;
	skb = sk->send_head;

	while (skb != NULL)
	{
		struct tcphdr *th;
		struct iphdr *iph;
		int size;

		dev = skb->dev;
		IS_SKB(skb);
		skb->when = jiffies;

		/*
		 *	Discard the surplus MAC header
		 */

		skb_pull(skb,((unsigned char *)skb->ip_hdr)-skb->data);

		/*
		 *	In general it's OK just to use the old packet. However we
		 *	need to use the current ack and window fields. Urg and
		 *	urg_ptr could possibly stand to be updated as well, but we
		 *	don't keep the necessary data. That shouldn't be a problem,
		 *	if the other end is doing the right thing. Since we're
		 *	changing the packet, we have to issue a new IP identifier.
		 */

		iph = (struct iphdr *)skb->data;
		th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2));
		size = ntohs(iph->tot_len) - (iph->ihl<<2);

		/*
		 *	Note: We ought to check for window limits here but
		 *	currently this is done (less efficiently) elsewhere.
		 */

		iph->id = htons(ip_id_count++);
		ip_send_check(iph);

		/*
		 *	Put a MAC header back on (may cause ARPing)
		 */

		if(skb->localroute)
			rt=ip_rt_local(iph->daddr,NULL,NULL);
		else
			rt=ip_rt_route(iph->daddr,NULL,NULL);

		if(rt==NULL)	/* Deep poo */
		{
			if(skb->sk)
			{
				skb->sk->err=ENETUNREACH;
				skb->sk->error_report(skb->sk);
			}
		}
		else
		{
			dev=rt->rt_dev;
			skb->raddr=rt->rt_gateway;
			if(skb->raddr==0)
				skb->raddr=iph->daddr;
			skb->dev=dev;
			skb->arp=1;
			if(dev->hard_header)
			{
				if(dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, skb->len)<0)
					skb->arp=0;
			}

			/*
			 *	This is not the right way to handle this. We have to
			 *	issue an up to date window and ack report with this
			 *	retransmit to keep the odd buggy tcp that relies on
			 *	the fact BSD does this happy.
			 *	We don't however need to recalculate the entire
			 *	checksum, so someone wanting a small problem to play
			 *	with might like to implement RFC1141/RFC1624 and speed
			 *	this up by avoiding a full checksum.
			 */

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));
			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			/*
			 *	If the interface is (still) up and running, kick it.
			 */

			if (dev->flags & IFF_UP)
			{
				/*
				 *	If the packet is still being sent by the device/protocol
				 *	below then don't retransmit. This is both needed, and good -
				 *	especially with connected mode AX.25 where it stops resends
				 *	of a frame that has not even been sent yet!
				 *	We still add up the counts as the round trip time wants
				 *	adjusting.
				 */
				if (sk && !skb_device_locked(skb))
				{
					/* Remove it from any existing driver queue first! */
					skb_unlink(skb);
					/* Now queue it */
					ip_statistics.IpOutRequests++;
					dev_queue_xmit(skb, dev, sk->priority);
				}
			}
		}

		/*
		 *	Count retransmissions
		 */

		ct++;
		sk->prot->retransmits++;
		tcp_statistics.TcpRetransSegs++;


		/*
		 *	Only one retransmit requested.
		 */

		if (!all)
			break;

		/*
		 *	This should cut it off before we send too many packets.
		 */

		if (ct >= sk->cong_window)
			break;
		skb = skb->link3;
	}
}

/*
 *	Reset the retransmission timer
 */

static void reset_xmit_timer(struct sock *sk, int why, unsigned long when)
{
	del_timer(&sk->retransmit_timer);
	sk->ip_xmit_timeout = why;
	if((int)when < 0)
	{
		when=3;
		printk("Error: Negative timer in xmit_timer\n");
	}
	sk->retransmit_timer.expires=jiffies+when;
	add_timer(&sk->retransmit_timer);
}

/*
 *	This is the normal code called for timeouts. It does the retransmission
 *	and then does backoff. tcp_do_retransmit is separated out because
 *	tcp_ack needs to send stuff from the retransmit queue without
 *	initiating a backoff.
 */


void tcp_retransmit_time(struct sock *sk, int all)
{
	tcp_do_retransmit(sk, all);

	/*
	 *	Increase the timeout each time we retransmit. Note that
	 *	we do not increase the rtt estimate. rto is initialized
	 *	from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 *	that doubling rto each time is the least we can get away with.
	 *	In KA9Q, Karn uses this for the first few times, and then
	 *	goes to quadratic. netBSD doubles, but only goes up to *64,
	 *	and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 *	defined in the protocol as the maximum possible RTT. I guess
	 *	we'll have to use something other than TCP to talk to the
	 *	University of Mars.
	 *
	 *	PAWS allows us longer timeouts and large windows, so once
	 *	implemented ftp to mars will work nicely. We will have to fix
	 *	the 120 second clamps though!
	 */
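
	/*
	 *	As a concrete illustration (assuming an initial rto of 3
	 *	seconds): the doubling-with-clamp below gives a retransmit
	 *	schedule of roughly 3, 6, 12, 24, 48, 96, 120, 120, ...
	 *	seconds between successive attempts.
	 */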

	sk->retransmits++;
	sk->prot->retransmits++;
	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}


/*
 *	A timer event has triggered a tcp retransmit timeout. The
 *	socket xmit queue is ready and set up to send. Because
 *	the ack receive code keeps the queue straight we do
 *	nothing clever here.
 */

static void tcp_retransmit(struct sock *sk, int all)
{
	if (all)
	{
		tcp_retransmit_time(sk, all);
		return;
	}

	sk->ssthresh = sk->cong_window >> 1;	/* remember window where we lost */
	/* sk->ssthresh in theory can be zero. I guess that's OK */
	sk->cong_count = 0;

	sk->cong_window = 1;

	/* Do the actual retransmit. */
	tcp_retransmit_time(sk, all);
}

/*
 *	A write timeout has occurred. Process the after effects.
 */

static int tcp_write_timeout(struct sock *sk)
{
	/*
	 *	Look for a 'soft' timeout.
	 */
	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1))
	{
		/*
		 *	Attempt to recover if arp has changed (unlikely!) or
		 *	a route has shifted (not supported prior to 1.3).
		 */
		arp_destroy (sk->daddr, 0);
		/*ip_route_check (sk->daddr);*/
	}

	/*
	 *	Have we tried to SYN too many times (repent repent 8))
	 */

	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
	{
		sk->err=ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
		tcp_set_state(sk,TCP_CLOSE);
		/* Don't FIN, we got nothing back */
		release_sock(sk);
		return 0;
	}
	/*
	 *	Has it gone just too far ?
	 */
	if (sk->retransmits > TCP_RETR2)
	{
		sk->err = ETIMEDOUT;
		sk->error_report(sk);
		del_timer(&sk->retransmit_timer);
		/*
		 *	Time wait the socket
		 */
		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING )
		{
			tcp_set_state(sk,TCP_TIME_WAIT);
			reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		}
		else
		{
			/*
			 *	Clean up time.
			 */
			tcp_set_state(sk, TCP_CLOSE);
			release_sock(sk);
			return 0;
		}
	}
	return 1;
}

/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1.	An initial rtt timeout on the probe0 should cause what we can
 *		of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 */

static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 *	Only process if socket is not in use
	 */

	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = jiffies+HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
			{
				struct sk_buff *skb;
				unsigned long flags;

				save_flags(flags);
				cli();
				skb = sk->send_head;
				if (!skb)
				{
					restore_flags(flags);
				}
				else
				{
					/*
					 *	Kicked by a delayed ack. Reset timer
					 *	correctly now
					 */
					if (jiffies < skb->when + sk->rto)
					{
						reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
						restore_flags(flags);
						break;
					}
					restore_flags(flags);
					/*
					 *	Retransmission
					 */
					sk->retransmits++;
					sk->prot->retransmits++;
					sk->prot->retransmit (sk, 0);
					tcp_write_timeout(sk);
				}
				break;
			}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 *	this reset_timer() call is a hack, this is not
			 *	how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				sk->prot->write_wakeup (sk);
			sk->retransmits++;
			sk->prot->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition. If err < 0 then the socket should
 *	be closed and the error returned to the user. If err > 0
 *	it's just the icmp type << 8 | icmp code. After adjustment
 *	header points to the first 8 bytes of the tcp header. We need
 *	to find the appropriate port.
 */

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	struct iphdr *iph=(struct iphdr *)header;

	header+=4*iph->ihl;


	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 *	FIXME:
		 *	For now we will just trigger a linear backoff.
		 *	The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 */

	if (code < 13 && (icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV))
	{
		sk->err = icmp_err_convert[code].errno;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			tcp_statistics.TcpAttemptFails++;
			tcp_set_state(sk,TCP_CLOSE);
			sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
		}
	}
	return;
}


/*
 *	Walk down the receive queue counting readable data until we hit the end
 *	or we find a gap in the received data queue (i.e. a frame missing that
 *	needs sending to us). Not sorting the data into two queues as it
 *	arrives makes life so much harder.
 */

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->h.th->seq))	/* Found a hole so stop here */
			break;
		sum = skb->len - (counted - skb->h.th->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 *	Don't count urg data ... but do it in the right place!
		 *	Consider: "old_data (ptr is here) URG PUSH data"
		 *	The old code would stop at the first push because
		 *	it counted the urg (amount==1) and then does amount--
		 *	*after* the loop. This means tcp_readable() always
		 *	returned zero if any URG PUSH was in the queue, even
		 *	though there was normal data available. If we subtract
		 *	the urg data right here, we even get it to work for more
		 *	than one URG PUSH skb without normal data.
		 *	This means that select() finally works now with urg data
		 *	in the queue. Note that rlogin was never affected
		 *	because it doesn't use select(); it uses two processes
		 *	and a blocking read(). And the queue scan in tcp_read()
		 *	was correct. Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh)
			break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}

/*
 *	LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		sk->inuse = 1;
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);
		return retval;
	}
	return 0;
}


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to set "sk->inuse", as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 *	This is now right thanks to a small fix
		 *	by Matt Dillon.
		 */

		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{
		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			sk->inuse = 1;
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);
			amount = sock_wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 */

unsigned short tcp_check(struct tcphdr *th, int len,
	unsigned long saddr, unsigned long daddr, unsigned long base)
{
	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
}
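
/*
 *	Note: csum_tcpudp_magic() folds the TCP pseudo-header (source
 *	address, destination address, protocol and TCP length) into the
 *	checksum; 'base' is the partial checksum already accumulated over
 *	the TCP header and data by csum_partial().
 */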


void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	return;
}

/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 */

static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	Length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
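
	/*
	 *	skb->h.seq is thus the sequence number of the octet just past
	 *	this segment: the starting sequence number plus the payload
	 *	length (total size minus the 4*doff byte TCP header).
	 */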

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit. So
		 * we shouldn't need to set it at all. I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}

/*
 *	Locking problems lead us to a messy situation where we can have
 *	multiple partially complete buffers queued up. This is really bad
 *	as we don't want to be sending partial buffers. Fix this with
 *	a semaphore or similar to lock tcp_write per socket.
 *
 *	These routines are pretty self descriptive.
 */

struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 *	Empty the partial queue
 */

static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 *	Queue a partial frame
 */

void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
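	/*
	 *	(This 1 second bound is what satisfies the RFC1122
	 *	"MUST NOT buffer data indefinitely" note in the status
	 *	list at the top of this file.)
	 */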
	sk->partial_timer.expires = jiffies+HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	if (tmp)
		tcp_send_skb(sk, tmp);
}


/*
 *	This routine sends an ack and also updates the window.
 */

static void tcp_send_ack(u32 sequence, u32 ack,
	struct sock *sk,
	struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->sk = sk;
	buff->localroute = sk->localroute;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}
	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
			&& sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
	}

	/*
	 *	Fill in the packet and send it
	 */

	t1->ack_seq = ntohl(ack);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
	if (sk->debug)
		printk("\rtcp_ack: seq %x ack %x\n", sequence, ack);
	tcp_statistics.TcpOutSegs++;
	sk->prot->queue_xmit(sk, dev, buff, 1);
}


/*
 *	This routine builds a generic TCP header.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 */

static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
1650 int len, int nonblock, int flags)
1651 {
1652 int copied = 0;
1653 int copy;
1654 int tmp;
1655 int seglen;
1656 int iovct=0;
1657 struct sk_buff *skb;
1658 struct sk_buff *send_tmp;
1659 struct proto *prot;
1660 struct device *dev = NULL;
1661 unsigned char *from;
1662
1663 /*
1664 * Do sanity checking for sendmsg/sendto/send
1665 */
1666
1667 if (flags & ~(MSG_OOB|MSG_DONTROUTE))
1668 return -EINVAL;
1669 if (msg->msg_name)
1670 {
1671 struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
1672 if(sk->state == TCP_CLOSE)
1673 return -ENOTCONN;
1674 if (msg->msg_namelen < sizeof(*addr))
1675 return -EINVAL;
1676 if (addr->sin_family && addr->sin_family != AF_INET)
1677 return -EINVAL;
1678 if (addr->sin_port != sk->dummy_th.dest)
1679 return -EISCONN;
1680 if (addr->sin_addr.s_addr != sk->daddr)
1681 return -EISCONN;
1682 }
1683
1684 /*
1685 * Ok commence sending
1686 */
1687
1688 while(iovct<msg->msg_iovlen)
1689 {
1690 seglen=msg->msg_iov[iovct].iov_len;
1691 from=msg->msg_iov[iovct++].iov_base;
1692 sk->inuse=1;
1693 prot = sk->prot;
1694 while(seglen > 0)
1695 {
1696 if (sk->err)
1697 { /* Stop on an error */
1698 release_sock(sk);
1699 if (copied)
1700 return(copied);
1701 return sock_error(sk);
1702 }
1703
1704 /*
1705 * First thing we do is make sure that we are established.
1706 */
1707
1708 if (sk->shutdown & SEND_SHUTDOWN)
1709 {
1710 release_sock(sk);
1711 sk->err = EPIPE;
1712 if (copied)
1713 return(copied);
1714 sk->err = 0;
1715 return(-EPIPE);
1716 }
1717
1718 /*
1719 * Wait for a connection to finish.
1720 */
1721
1722 while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
1723 {
1724 if (sk->err)
1725 {
1726 release_sock(sk);
1727 if (copied)
1728 return(copied);
1729 return sock_error(sk);
1730 }
1731
1732 if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
1733 {
1734 release_sock(sk);
1735 if (copied)
1736 return(copied);
1737
1738 if (sk->err)
1739 return sock_error(sk);
1740
1741 if (sk->keepopen)
1742 {
1743 send_sig(SIGPIPE, current, 0);
1744 }
1745 return(-EPIPE);
1746 }
1747
1748 if (nonblock || copied)
1749 {
1750 release_sock(sk);
1751 if (copied)
1752 return(copied);
1753 return(-EAGAIN);
1754 }
1755
1756 release_sock(sk);
1757 cli();
1758
1759 if (sk->state != TCP_ESTABLISHED &&
1760 sk->state != TCP_CLOSE_WAIT && sk->err == 0)
1761 {
1762 interruptible_sleep_on(sk->sleep);
1763 if (current->signal & ~current->blocked)
1764 {
1765 sti();
1766 if (copied)
1767 return(copied);
1768 return(-ERESTARTSYS);
1769 }
1770 }
1771 sk->inuse = 1;
1772 sti();
1773 }
1774
1775 /*
1776 * The following code can result in copy <= 0 if sk->mss is ever
1777 * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
1778 * sk->mtu is constant once SYN processing is finished. I.e. we
1779 * had better not get here until we've seen his SYN and at least one
1780 * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
1781 * But ESTABLISHED should guarantee that. sk->max_window is by definition
1782 * non-decreasing. Note that any ioctl to set user_mss must be done
1783 * before the exchange of SYN's. If the initial ack from the other
1784 * end has a window of 0, max_window and thus mss will both be 0.
1785 */
1786
1787 /*
1788 * Now we need to check if we have a half built packet.
1789 */
1790
1791 if ((skb = tcp_dequeue_partial(sk)) != NULL)
1792 {
1793 int hdrlen;
1794
1795 /* IP header + TCP header */
1796 hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
1797 + sizeof(struct tcphdr);
1798
1799 /* Add more stuff to the end of skb->len */
1800 if (!(flags & MSG_OOB))
1801 {
1802 copy = min(sk->mss - (skb->len - hdrlen), len);
1803 /* FIXME: this is really a bug. */
1804 if (copy <= 0)
1805 {
1806 printk("TCP: **bug**: \"copy\" <= 0!!\n");
1807 copy = 0;
1808 }
1809 memcpy_fromfs(skb_put(skb,copy), from, copy);
1810 from += copy;
1811 copied += copy;
1812 len -= copy;
1813 sk->write_seq += copy;
1814 seglen -= copy;
1815 }
1816 if ((skb->len - hdrlen) >= sk->mss ||
1817 (flags & MSG_OOB) || !sk->packets_out)
1818 tcp_send_skb(sk, skb);
1819 else
1820 tcp_enqueue_partial(skb, sk);
1821 continue;
1822 }
1823
1824 /*
1825 * We also need to worry about the window.
1826 * If window < 1/2 the maximum window we've seen from this
1827 * host, don't use it. This is sender side
1828 * silly window prevention, as specified in RFC1122.
1829 * (Note that this is different from earlier versions of
1830 * SWS prevention, e.g. RFC813.) What we actually do is
1831 * use the whole MSS. Since this results in the right
1832 * edge of the packet being outside the window, it will
1833 * be queued for later rather than sent.
1834 */
1835
1836 copy = sk->window_seq - sk->write_seq;
1837 if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
1838 copy = sk->mss;
1839 if (copy > len)
1840 copy = len;
1841
1842 /*
1843 * We should really check the window here also.
1844 */
1845
1846 send_tmp = NULL;
1847 if (copy < sk->mss && !(flags & MSG_OOB))
1848 {
1849 /*
1850 * We will release the socket in case we sleep here.
1851 */
1852 release_sock(sk);
1853 /*
1854 * NB: following must be mtu, because mss can be increased.
1855 * mss is always <= mtu
1856 */
1857 skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
1858 sk->inuse = 1;
1859 send_tmp = skb;
1860 }
1861 else
1862 {
1863 /*
1864 * We will release the socket in case we sleep here.
1865 */
1866 release_sock(sk);
1867 skb = sock_wmalloc(sk, copy + prot->max_header + 15 , 0, GFP_KERNEL);
1868 sk->inuse = 1;
1869 }
1870
1871 /*
1872 * If we didn't get any memory, we need to sleep.
1873 */
1874
1875 if (skb == NULL)
1876 {
1877 sk->socket->flags |= SO_NOSPACE;
1878 if (nonblock)
1879 {
1880 release_sock(sk);
1881 if (copied)
1882 return(copied);
1883 return(-EAGAIN);
1884 }
1885
1886 /*
1887 * FIXME: here is another race condition.
1888 */
1889
1890 tmp = sk->wmem_alloc;
1891 release_sock(sk);
1892 cli();
1893 /*
1894 * Again we will try to avoid it.
1895 */
1896 if (tmp <= sk->wmem_alloc &&
1897 (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
1898 && sk->err == 0)
1899 {
1900 sk->socket->flags &= ~SO_NOSPACE;
1901 interruptible_sleep_on(sk->sleep);
1902 if (current->signal & ~current->blocked)
1903 {
1904 sti();
1905 if (copied)
1906 return(copied);
1907 return(-ERESTARTSYS);
1908 }
1909 }
1910 sk->inuse = 1;
1911 sti();
1912 continue;
1913 }
1914
1915 skb->sk = sk;
1916 skb->free = 0;
1917 skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
1918
1919 /*
1920 * FIXME: we need to optimize this.
1921 * Perhaps some hints here would be good.
1922 */
1923
1924 tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
1925 IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl);
1926 if (tmp < 0 )
1927 {
1928 sock_wfree(sk, skb);
1929 release_sock(sk);
1930 if (copied)
1931 return(copied);
1932 return(tmp);
1933 }
1934 skb->dev = dev;
1935 skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
1936 tmp = tcp_build_header(skb->h.th, sk, len-copy);
1937 if (tmp < 0)
1938 {
1939 sock_wfree(sk, skb);
1940 release_sock(sk);
1941 if (copied)
1942 return(copied);
1943 return(tmp);
1944 }
1945
1946 if (flags & MSG_OOB)
1947 {
1948 skb->h.th->urg = 1;
1949 skb->h.th->urg_ptr = ntohs(copy);
1950 }
1951
1952 memcpy_fromfs(skb_put(skb,copy), from, copy);
1953
1954 from += copy;
1955 copied += copy;
1956 len -= copy;
1957 seglen -= copy;
1958 skb->free = 0;
1959 sk->write_seq += copy;
1960
1961 if (send_tmp != NULL && sk->packets_out)
1962 {
1963 tcp_enqueue_partial(send_tmp, sk);
1964 continue;
1965 }
1966 tcp_send_skb(sk, skb);
1967 }
1968 }
1969 sk->err = 0;
1970
1971 /*
1972 * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
1973 * interactive fast network servers. It's meant to be on and
1974 * it really improves the throughput though not the echo time
1975 * on my slow slip link - Alan
1976 */
1977
1978 /*
1979 * Avoid possible race on send_tmp - c/o Johannes Stille
1980 */
1981
1982 if(sk->partial && ((!sk->packets_out)
1983 /* If not nagling we can send on the before case too.. */
1984 || (sk->nonagle && before(sk->write_seq , sk->window_seq))
1985 ))
1986 tcp_send_partial(sk);
1987
1988 release_sock(sk);
1989 return(copied);
1990 }
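
/*
 * A minimal standalone restatement of the segment sizing rule in
 * tcp_sendmsg() above, assuming plain wraparound-free arithmetic for
 * clarity: take the usable window, but fall back to a whole MSS when
 * the window is closed or has shrunk below half the largest window
 * the peer ever offered (sender-side SWS avoidance); an oversized
 * segment simply queues until the window opens. The helper name is
 * illustrative and nothing above calls it.
 */
static int tcp_sws_copy_limit(unsigned long window_seq, unsigned long write_seq,
	unsigned long max_window, int mss, int len)
{
	long copy = (long)(window_seq - write_seq);	/* usable window */

	if (copy <= 0 || copy < (long)(max_window >> 1) || copy > mss)
		copy = mss;			/* use a whole MSS instead */
	if (copy > len)
		copy = len;			/* never more than was asked for */
	return (int)copy;
}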
1991
1992 /*
1993 * Send an ack if one is backlogged at this point. Ought to merge
1994 * this with tcp_send_ack().
1995 */
1996
1997 static void tcp_read_wakeup(struct sock *sk)
1998 {
1999 int tmp;
2000 struct device *dev = NULL;
2001 struct tcphdr *t1;
2002 struct sk_buff *buff;
2003
2004 if (!sk->ack_backlog)
2005 return;
2006
2007 /*
2008 * If we're closed, don't send an ack, or we'll get a RST
2009 * from the closed destination.
2010 */
2011 if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
2012 return;
2013
2014 /*
2015 * FIXME: we need to put code here to prevent this routine from
2016 * being called. Being called once in a while is ok, so only check
2017 * if this is the second time in a row.
2018 */
2019
2020 /*
2021 * We need to grab some memory, and put together an ack,
2022 * and then put it into the queue to be sent.
2023 */
2024
2025 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
2026 if (buff == NULL)
2027 {
2028 /* Try again real soon. */
2029 reset_xmit_timer(sk, TIME_WRITE, HZ);
2030 return;
2031 }
2032
2033 buff->sk = sk;
2034 buff->localroute = sk->localroute;
2035
2036 /*
2037 * Put in the IP header and routing stuff.
2038 */
2039
2040 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
2041 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
2042 if (tmp < 0)
2043 {
2044 buff->free = 1;
2045 sock_wfree(sk, buff);
2046 return;
2047 }
2048
2049 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2050
2051 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
2052 t1->seq = htonl(sk->sent_seq);
2053 t1->ack = 1;
2054 t1->res1 = 0;
2055 t1->res2 = 0;
2056 t1->rst = 0;
2057 t1->urg = 0;
2058 t1->syn = 0;
2059 t1->psh = 0;
2060 sk->ack_backlog = 0;
2061 sk->bytes_rcv = 0;
2062 sk->window = tcp_select_window(sk);
2063 t1->window = ntohs(sk->window);
2064 t1->ack_seq = ntohl(sk->acked_seq);
2065 t1->doff = sizeof(*t1)/4;
2066 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2067 sk->prot->queue_xmit(sk, dev, buff, 1);
2068 tcp_statistics.TcpOutSegs++;
2069 }
2070
2071
2072 /*
2073 * FIXME:
2074 * This routine frees used buffers.
2075 * It should consider sending an ACK to let the
2076 * other end know we now have a bigger window.
2077 */
2078
2079 static void cleanup_rbuf(struct sock *sk)
2080 {
2081 unsigned long flags;
2082 unsigned long left;
2083 struct sk_buff *skb;
2084 unsigned long rspace;
2085
2086 if(sk->debug)
2087 printk("cleaning rbuf for sk=%p\n", sk);
2088
2089 save_flags(flags);
2090 cli();
2091
2092 left = sock_rspace(sk);
2093
2094 /*
2095 * We have to loop through all the buffer headers,
2096 * and try to free up all the space we can.
2097 */
2098
2099 while((skb=skb_peek(&sk->receive_queue)) != NULL)
2100 {
2101 if (!skb->used || skb->users)
2102 break;
2103 skb_unlink(skb);
2104 skb->sk = sk;
2105 kfree_skb(skb, FREE_READ);
2106 }
2107
2108 restore_flags(flags);
2109
2110 /*
2111 * FIXME:
2112 * At this point we should send an ack if the difference
2113 * between the window and the amount of space is bigger than
2114 * TCP_WINDOW_DIFF.
2115 */
2116
2117 if(sk->debug)
2118 printk("sk->rspace = %lu, was %lu\n", sock_rspace(sk),
2119 left);
2120 if ((rspace=sock_rspace(sk)) != left)
2121 {
2122 /*
2123 * This area has caused the most trouble. The current strategy
2124 * is to simply do nothing if the other end has room to send at
2125 * least 3 full packets, because the ack from those will auto-
2126 * matically update the window. If the other end doesn't think
2127 * we have much space left, but we have room for at least 1 more
2128 * complete packet than it thinks we do, we will send an ack
2129 * immediately. Otherwise we will wait up to .5 seconds in case
2130 * the user reads some more.
2131 */
2132 sk->ack_backlog++;
2133 /*
2134 * It's unclear whether to use sk->mtu or sk->mss here. They differ only
2135 * if the other end is offering a window smaller than the agreed on MSS
2136 * (called sk->mtu here). In theory there's no connection between send
2137 * and receive, and so no reason to think that they're going to send
2138 * small packets. For the moment I'm using the hack of reducing the mss
2139 * only on the send side, so I'm putting mtu here.
2140 */
2141
2142 if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
2143 {
2144 /* Send an ack right now. */
2145 tcp_read_wakeup(sk);
2146 }
2147 else
2148 {
2149 /* Force it to send an ack soon. */
2150 int was_active = del_timer(&sk->retransmit_timer);
2151 if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
2152 {
2153 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
2154 }
2155 else
2156 add_timer(&sk->retransmit_timer);
2157 }
2158 }
2159 }
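
/*
 * The decision cleanup_rbuf() makes once receive space has been freed,
 * condensed into one predicate; it assumes the same fields the routine
 * reads (advertised window, unacked received bytes, sk->mtu). Ack at
 * once if the peer could now be told of at least one more full packet
 * of space; otherwise a delayed ack is enough. Illustrative helper.
 */
static int tcp_should_ack_now(unsigned long rspace, unsigned long window,
	unsigned long bytes_rcv, unsigned long mtu)
{
	/* Free space exceeds what the peer believes it has by more
	   than one maximum sized segment. */
	return rspace > (window - bytes_rcv + mtu);
}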
2160
2161
2162 /*
2163 * Handle reading urgent data. BSD has very simple semantics for
2164 * this, no blocking and very strange errors 8)
2165 */
2166
2167 static int tcp_recv_urg(struct sock * sk, int nonblock,
2168 struct msghdr *msg, int len, int flags, int *addr_len)
2169 {
2170 /*
2171 * No URG data to read
2172 */
2173 if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
2174 return -EINVAL; /* Yes this is right ! */
2175
2176 if (sk->err)
2177 return sock_error(sk);
2178
2179 if (sk->state == TCP_CLOSE || sk->done)
2180 {
2181 if (!sk->done)
2182 {
2183 sk->done = 1;
2184 return 0;
2185 }
2186 return -ENOTCONN;
2187 }
2188
2189 if (sk->shutdown & RCV_SHUTDOWN)
2190 {
2191 sk->done = 1;
2192 return 0;
2193 }
2194 sk->inuse = 1;
2195 if (sk->urg_data & URG_VALID)
2196 {
2197 char c = sk->urg_data;
2198 if (!(flags & MSG_PEEK))
2199 sk->urg_data = URG_READ;
2200 memcpy_toiovec(msg->msg_iov, &c, 1);
2201 if(msg->msg_name)
2202 {
2203 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2204 sin->sin_family=AF_INET;
2205 sin->sin_addr.s_addr=sk->daddr;
2206 sin->sin_port=sk->dummy_th.dest;
2207 }
2208 if(addr_len)
2209 *addr_len=sizeof(struct sockaddr_in);
2210 release_sock(sk);
2211 return 1;
2212 }
2213 release_sock(sk);
2214
2215 /*
2216 * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
2217 * the available implementations agree in this case:
2218 * this call should never block, independent of the
2219 * blocking state of the socket.
2220 * Mike <pall@rz.uni-karlsruhe.de>
2221 */
2222 return -EAGAIN;
2223 }
2224
2225
2226 /*
2227 * This routine copies from a sock struct into the user buffer.
2228 */
2229
2230 static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
2231 int len, int nonblock, int flags, int *addr_len)
2232 {
2233 struct wait_queue wait = { current, NULL };
2234 int copied = 0;
2235 u32 peek_seq;
2236 volatile u32 *seq; /* So gcc doesn't overoptimise */
2237 unsigned long used;
2238
2239 /*
2240 * This error should be checked.
2241 */
2242
2243 if (sk->state == TCP_LISTEN)
2244 return -ENOTCONN;
2245
2246 /*
2247 * Urgent data needs to be handled specially.
2248 */
2249
2250 if (flags & MSG_OOB)
2251 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
2252
2253 /*
2254 * Copying sequence to update. This is volatile to handle
2255 * the multi-reader case neatly (memcpy_to/fromfs might be
2256 * inline and thus not flush cached variables otherwise).
2257 */
2258
2259 peek_seq = sk->copied_seq;
2260 seq = &sk->copied_seq;
2261 if (flags & MSG_PEEK)
2262 seq = &peek_seq;
2263
2264 add_wait_queue(sk->sleep, &wait);
2265 sk->inuse = 1;
2266 while (len > 0)
2267 {
2268 struct sk_buff * skb;
2269 u32 offset;
2270
2271 /*
2272 * Are we at urgent data? Stop if we have read anything.
2273 */
2274
2275 if (copied && sk->urg_data && sk->urg_seq == *seq)
2276 break;
2277
2278 /*
2279 * Next get a buffer.
2280 */
2281
2282 current->state = TASK_INTERRUPTIBLE;
2283
2284 skb = skb_peek(&sk->receive_queue);
2285 do
2286 {
2287 if (!skb)
2288 break;
2289 if (before(*seq, skb->h.th->seq))
2290 break;
2291 offset = *seq - skb->h.th->seq;
2292 if (skb->h.th->syn)
2293 offset--;
2294 if (offset < skb->len)
2295 goto found_ok_skb;
2296 if (skb->h.th->fin)
2297 goto found_fin_ok;
2298 if (!(flags & MSG_PEEK))
2299 skb->used = 1;
2300 skb = skb->next;
2301 }
2302 while (skb != (struct sk_buff *)&sk->receive_queue);
2303
2304 if (copied)
2305 break;
2306
2307 if (sk->err)
2308 {
2309 copied = -xchg(&sk->err,0);
2310 break;
2311 }
2312
2313 if (sk->state == TCP_CLOSE)
2314 {
2315 if (!sk->done)
2316 {
2317 sk->done = 1;
2318 break;
2319 }
2320 copied = -ENOTCONN;
2321 break;
2322 }
2323
2324 if (sk->shutdown & RCV_SHUTDOWN)
2325 {
2326 sk->done = 1;
2327 break;
2328 }
2329
2330 if (nonblock)
2331 {
2332 copied = -EAGAIN;
2333 break;
2334 }
2335
2336 cleanup_rbuf(sk);
2337 release_sock(sk);
2338 sk->socket->flags |= SO_WAITDATA;
2339 schedule();
2340 sk->socket->flags &= ~SO_WAITDATA;
2341 sk->inuse = 1;
2342
2343 if (current->signal & ~current->blocked)
2344 {
2345 copied = -ERESTARTSYS;
2346 break;
2347 }
2348 continue;
2349
2350 found_ok_skb:
2351 /*
2352 * Lock the buffer. We can be fairly relaxed as
2353 * an interrupt will never steal a buffer we are
2354 * using unless I've missed something serious in
2355 * tcp_data.
2356 */
2357
2358 skb->users++;
2359
2360 /*
2361 * Ok so how much can we use ?
2362 */
2363
2364 used = skb->len - offset;
2365 if (len < used)
2366 used = len;
2367 /*
2368 * Do we have urgent data here?
2369 */
2370
2371 if (sk->urg_data)
2372 {
2373 u32 urg_offset = sk->urg_seq - *seq;
2374 if (urg_offset < used)
2375 {
2376 if (!urg_offset)
2377 {
2378 if (!sk->urginline)
2379 {
2380 ++*seq;
2381 offset++;
2382 used--;
2383 }
2384 }
2385 else
2386 used = urg_offset;
2387 }
2388 }
2389
2390 /*
2391 * Copy it - We _MUST_ update *seq first so that we
2392 * don't ever double read when we have dual readers
2393 */
2394
2395 *seq += used;
2396
2397 /*
2398 * This memcpy_tofs can sleep. If it sleeps and we
2399 * do a second read it relies on the skb->users to avoid
2400 * a crash when cleanup_rbuf() gets called.
2401 */
2402
2403 memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
2404 skb->h.th->doff*4 + offset, used);
2405 copied += used;
2406 len -= used;
2407
2408 /*
2409 * We now will not sleep again until we are finished
2410 * with skb. Sorry if you are doing the SMP port
2411 * but you'll just have to fix it neatly ;)
2412 */
2413
2414 skb->users --;
2415
2416 if (after(sk->copied_seq,sk->urg_seq))
2417 sk->urg_data = 0;
2418 if (used + offset < skb->len)
2419 continue;
2420
2421 /*
2422 * Process the FIN.
2423 */
2424
2425 if (skb->h.th->fin)
2426 goto found_fin_ok;
2427 if (flags & MSG_PEEK)
2428 continue;
2429 skb->used = 1;
2430 continue;
2431
2432 found_fin_ok:
2433 ++*seq;
2434 if (flags & MSG_PEEK)
2435 break;
2436
2437 /*
2438 * All is done
2439 */
2440
2441 skb->used = 1;
2442 sk->shutdown |= RCV_SHUTDOWN;
2443 break;
2444
2445 }
2446
2447 if(copied>0 && msg->msg_name)
2448 {
2449 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
2450 sin->sin_family=AF_INET;
2451 sin->sin_addr.s_addr=sk->daddr;
2452 sin->sin_port=sk->dummy_th.dest;
2453 }
2454 if(addr_len)
2455 *addr_len=sizeof(struct sockaddr_in);
2456
2457 remove_wait_queue(sk->sleep, &wait);
2458 current->state = TASK_RUNNING;
2459
2460 /* Clean up data we have read: This will do ACK frames */
2461 cleanup_rbuf(sk);
2462 release_sock(sk);
2463 return copied;
2464 }
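
/*
 * A standalone sketch of the urgent-data clipping in the receive loop
 * above, assuming the same sequence-space conventions: a read stops
 * just short of the urgent byte, and when the socket is not in
 * urginline mode the mark itself is stepped over rather than copied.
 * The helper name and the skip out-parameter are illustrative.
 */
static unsigned long tcp_clip_at_urg(unsigned long urg_seq, unsigned long seq,
	unsigned long used, int urginline, int *skip)
{
	unsigned long urg_offset = urg_seq - seq;

	*skip = 0;
	if (urg_offset < used)
	{
		if (urg_offset == 0)
		{
			if (!urginline)
			{
				*skip = 1;	/* advance seq/offset past the mark */
				used--;		/* and copy one byte less */
			}
		}
		else
			used = urg_offset;	/* stop just before the mark */
	}
	return used;
}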
2465
2466
2467
2468 /*
2469 * State processing on a close. This implements the state shift for
2470 * sending our FIN frame. Note that we only send a FIN for some
2471 * states. A shutdown() may have already sent the FIN, or we may be
2472 * closed.
2473 */
2474
2475 static int tcp_close_state(struct sock *sk, int dead)
2476 {
2477 int ns=TCP_CLOSE;
2478 int send_fin=0;
2479 switch(sk->state)
2480 {
2481 case TCP_SYN_SENT: /* No SYN back, no FIN needed */
2482 break;
2483 case TCP_SYN_RECV:
2484 case TCP_ESTABLISHED: /* Closedown begin */
2485 ns=TCP_FIN_WAIT1;
2486 send_fin=1;
2487 break;
2488 case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
2489 case TCP_FIN_WAIT2:
2490 case TCP_CLOSING:
2491 ns=sk->state;
2492 break;
2493 case TCP_CLOSE:
2494 case TCP_LISTEN:
2495 break;
2496 case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
2497 wait only for the ACK */
2498 ns=TCP_LAST_ACK;
2499 send_fin=1;
2500 }
2501
2502 tcp_set_state(sk,ns);
2503
2504 /*
2505 * This is a (useful) BSD violation of the RFC. There is a
2506 * problem with TCP as specified, in that the other end could
2507 * keep a socket open forever with no application left at this end.
2508 * We use a 3 minute timeout (about the same as BSD) and then kill
2509 * our end. If they send after that then tough - BUT it is long
2510 * enough that we avoid the old "4*rto = almost no time - whoops,
2511 * reset" mistake.
2512 */
2513 if(dead && ns==TCP_FIN_WAIT2)
2514 {
2515 int timer_active=del_timer(&sk->timer);
2516 if(timer_active)
2517 add_timer(&sk->timer);
2518 else
2519 reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
2520 }
2521
2522 return send_fin;
2523 }
2524
2525 /*
2526 * Send a fin.
2527 */
2528
2529 static void tcp_send_fin(struct sock *sk)
2530 {
2531 struct proto *prot =(struct proto *)sk->prot;
2532 struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
2533 struct tcphdr *t1;
2534 struct sk_buff *buff;
2535 struct device *dev=NULL;
2536 int tmp;
2537
2538 release_sock(sk); /* in case the malloc sleeps. */
2539
2540 buff = sock_wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL);
2541 sk->inuse = 1;
2542
2543 if (buff == NULL)
2544 {
2545 /* This is a disaster if it occurs */
2546 printk("tcp_send_fin: Impossible malloc failure");
2547 return;
2548 }
2549
2550 /*
2551 * Administrivia
2552 */
2553
2554 buff->sk = sk;
2555 buff->localroute = sk->localroute;
2556
2557 /*
2558 * Put in the IP header and routing stuff.
2559 */
2560
2561 tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev,
2562 IPPROTO_TCP, sk->opt,
2563 sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl);
2564 if (tmp < 0)
2565 {
2566 int t;
2567 /*
2568 * Finish anyway, treat this as a send that got lost.
2569 * (Not good).
2570 */
2571
2572 buff->free = 1;
2573 sock_wfree(sk,buff);
2574 sk->write_seq++;
2575 t=del_timer(&sk->timer);
2576 if(t)
2577 add_timer(&sk->timer);
2578 else
2579 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
2580 return;
2581 }
2582
2583 /*
2584 * We ought to check if the end of the queue is a buffer and
2585 * if so simply add the fin to that buffer, not send it ahead.
2586 */
2587
2588 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2589 buff->dev = dev;
2590 memcpy(t1, th, sizeof(*t1));
2591 t1->seq = ntohl(sk->write_seq);
2592 sk->write_seq++;
2593 buff->h.seq = sk->write_seq;
2594 t1->ack = 1;
2595 t1->ack_seq = ntohl(sk->acked_seq);
2596 t1->window = ntohs(sk->window=tcp_select_window(sk));
2597 t1->fin = 1;
2598 t1->rst = 0;
2599 t1->doff = sizeof(*t1)/4;
2600 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
2601
2602 /*
2603 * If there is data in the write queue, the fin must be appended to
2604 * the write queue.
2605 */
2606
2607 if (skb_peek(&sk->write_queue) != NULL)
2608 {
2609 buff->free = 0;
2610 if (buff->next != NULL)
2611 {
2612 printk("tcp_send_fin: next != NULL\n");
2613 skb_unlink(buff);
2614 }
2615 skb_queue_tail(&sk->write_queue, buff);
2616 }
2617 else
2618 {
2619 sk->sent_seq = sk->write_seq;
2620 sk->prot->queue_xmit(sk, dev, buff, 0);
2621 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
2622 }
2623 }
2624
2625 /*
2626 * Shutdown the sending side of a connection. Much like close except
2627 * that we don't receive shut down or set sk->dead=1.
2628 */
2629
2630 void tcp_shutdown(struct sock *sk, int how)
2631 {
2632 /*
2633 * We need to grab some memory, and put together a FIN,
2634 * and then put it into the queue to be sent.
2635 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2636 */
2637
2638 if (!(how & SEND_SHUTDOWN))
2639 return;
2640
2641 /*
2642 * If we've already sent a FIN, or it's a closed state
2643 */
2644
2645 if (sk->state == TCP_FIN_WAIT1 ||
2646 sk->state == TCP_FIN_WAIT2 ||
2647 sk->state == TCP_CLOSING ||
2648 sk->state == TCP_LAST_ACK ||
2649 sk->state == TCP_TIME_WAIT ||
2650 sk->state == TCP_CLOSE ||
2651 sk->state == TCP_LISTEN
2652 )
2653 {
2654 return;
2655 }
2656 sk->inuse = 1;
2657
2658 /*
2659 * flag that the sender has shutdown
2660 */
2661
2662 sk->shutdown |= SEND_SHUTDOWN;
2663
2664 /*
2665 * Clear out any half completed packets.
2666 */
2667
2668 if (sk->partial)
2669 tcp_send_partial(sk);
2670
2671 /*
2672 * FIN if needed
2673 */
2674
2675 if(tcp_close_state(sk,0))
2676 tcp_send_fin(sk);
2677
2678 release_sock(sk);
2679 }
2680
2681 /*
2682 * This routine will send an RST to the other tcp.
2683 */
2684
2685 static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
2686 struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
2687 {
2688 struct sk_buff *buff;
2689 struct tcphdr *t1;
2690 int tmp;
2691 struct device *ndev=NULL;
2692
2693 /*
2694 * Cannot reset a reset (Think about it).
2695 */
2696
2697 if(th->rst)
2698 return;
2699
2700 /*
2701 * We need to grab some memory, and put together an RST,
2702 * and then put it into the queue to be sent.
2703 */
2704
2705 buff = sock_wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
2706 if (buff == NULL)
2707 return;
2708
2709 buff->sk = NULL;
2710 buff->dev = dev;
2711 buff->localroute = 0;
2712
2713 /*
2714 * Put in the IP header and routing stuff.
2715 */
2716
2717 tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt,
2718 sizeof(struct tcphdr),tos,ttl);
2719 if (tmp < 0)
2720 {
2721 buff->free = 1;
2722 sock_wfree(NULL, buff);
2723 return;
2724 }
2725
2726 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
2727 memcpy(t1, th, sizeof(*t1));
2728
2729 /*
2730 * Swap the send and the receive.
2731 */
2732
2733 t1->dest = th->source;
2734 t1->source = th->dest;
2735 t1->rst = 1;
2736 t1->window = 0;
2737
2738 if(th->ack)
2739 {
2740 t1->ack = 0;
2741 t1->seq = th->ack_seq;
2742 t1->ack_seq = 0;
2743 }
2744 else
2745 {
2746 t1->ack = 1;
2747 if(!th->syn)
2748 t1->ack_seq=htonl(th->seq);
2749 else
2750 t1->ack_seq=htonl(th->seq+1);
2751 t1->seq=0;
2752 }
2753
2754 t1->syn = 0;
2755 t1->urg = 0;
2756 t1->fin = 0;
2757 t1->psh = 0;
2758 t1->doff = sizeof(*t1)/4;
2759 tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL);
2760 prot->queue_xmit(NULL, ndev, buff, 1);
2761 tcp_statistics.TcpOutSegs++;
2762 }
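
/*
 * The seq/ack choices above are the RFC 793 reset generation rules as
 * this routine applies them; restated compactly in host byte order for
 * clarity (names illustrative):
 */
static void tcp_rst_fields(int in_ack, unsigned long in_seq, unsigned long in_ack_seq,
	int in_syn, unsigned long *seq, unsigned long *ack_seq, int *ack)
{
	if (in_ack)
	{
		/* The offender carried an ACK: reuse its acknowledgment
		   number as our sequence and send no ACK of our own. */
		*ack = 0;
		*seq = in_ack_seq;
		*ack_seq = 0;
	}
	else
	{
		/* No ACK: sequence zero, and acknowledge the segment,
		   remembering that a SYN consumes one sequence number. */
		*ack = 1;
		*seq = 0;
		*ack_seq = in_seq + (in_syn ? 1 : 0);
	}
}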
2763
2764
2765 /*
2766 * Look for tcp options. Parses everything but only knows about MSS.
2767 * This routine is always called with the packet containing the SYN.
2768 * However it may also be called with the ack to the SYN. So you
2769 * can't assume this is always the SYN. It's always called after
2770 * we have set up sk->mtu to our own MTU.
2771 *
2772 * We need at minimum to add PAWS support here. Possibly large windows
2773 * as Linux gets deployed on 100Mb/sec networks.
2774 */
2775
2776 static void tcp_options(struct sock *sk, struct tcphdr *th)
2777 {
2778 unsigned char *ptr;
2779 int length=(th->doff*4)-sizeof(struct tcphdr);
2780 int mss_seen = 0;
2781
2782 ptr = (unsigned char *)(th + 1);
2783
2784 while(length>0)
2785 {
2786 int opcode=*ptr++;
2787 int opsize=*ptr++;
2788 switch(opcode)
2789 {
2790 case TCPOPT_EOL:
2791 return;
2792 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
2793 length--;
2794 ptr--; /* the opsize=*ptr++ above was a mistake */
2795 continue;
2796
2797 default:
2798 if(opsize<=2) /* Avoid silly options looping forever */
2799 return;
2800 switch(opcode)
2801 {
2802 case TCPOPT_MSS:
2803 if(opsize==4 && th->syn)
2804 {
2805 sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
2806 mss_seen = 1;
2807 }
2808 break;
2809 /* Add other options here as people feel the urge to implement stuff like large windows */
2810 }
2811 ptr+=opsize-2;
2812 length-=opsize;
2813 }
2814 }
2815 if (th->syn)
2816 {
2817 if (! mss_seen)
2818 sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
2819 }
2820 #ifdef CONFIG_INET_PCTCP
2821 sk->mss = min(sk->max_window >> 1, sk->mtu);
2822 #else
2823 sk->mss = min(sk->max_window, sk->mtu);
2824 #endif
2825 }
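
/*
 * A self-contained sketch of the option walk above, reduced to the one
 * option this file acts on (MSS: kind 2, length 4) and assuming the
 * standard wire layout. It returns the advertised MSS in host order,
 * or 0 if none is found; unlike the code above it also bails out when
 * an option claims to be longer than the option area. Illustrative.
 */
static unsigned short tcp_parse_mss(unsigned char *ptr, int length)
{
	while (length > 0)
	{
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			return 0;
		if (opcode == TCPOPT_NOP)	/* one byte, no length field */
		{
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize <= 2 || opsize > length)	/* malformed option */
			return 0;
		if (opcode == TCPOPT_MSS && opsize == 4)
			return (unsigned short)((ptr[0] << 8) | ptr[1]);
		ptr += opsize - 2;
		length -= opsize;
	}
	return 0;
}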
2826
2827 static inline unsigned long default_mask(unsigned long dst)
2828 {
2829 dst = ntohl(dst);
2830 if (IN_CLASSA(dst))
2831 return htonl(IN_CLASSA_NET);
2832 if (IN_CLASSB(dst))
2833 return htonl(IN_CLASSB_NET);
2834 return htonl(IN_CLASSC_NET);
2835 }
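
/*
 * Worked examples of the classful fallback above, with addresses and
 * masks written as dotted quads for readability:
 *
 *	10.1.2.3   is class A  ->  255.0.0.0
 *	130.1.2.3  is class B  ->  255.255.0.0
 *	192.0.2.1  is class C  ->  255.255.255.0
 */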
2836
2837 /*
2838 * Default sequence number picking algorithm.
2839 * As close as possible to RFC 793, which
2840 * suggests using a 250kHz clock.
2841 * Further reading shows this assumes 2MB/s networks.
2842 * For 10MB/s ethernet, a 1MHz clock is appropriate.
2843 * That's funny, Linux has one built in! Use it!
2844 */
2845
2846 extern inline u32 tcp_init_seq(void)
2847 {
2848 struct timeval tv;
2849 do_gettimeofday(&tv);
2850 return tv.tv_usec+tv.tv_sec*1000000;
2851 }
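
/*
 * A quick sanity check on the 1 MHz choice: the 32-bit sequence space
 * wraps every 2^32 microseconds, i.e. about 4295 seconds or roughly
 * 71.6 minutes - comfortably longer than any plausible segment
 * lifetime on the networks this code targets.
 */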
2852
2853 /*
2854 * This routine handles a connection request.
2855 * It should make sure we haven't already responded.
2856 * Because of the way BSD works, we have to send a syn/ack now.
2857 * This also means it will be harder to close a socket which is
2858 * listening.
2859 */
2860
2861 static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
2862 unsigned long daddr, unsigned long saddr,
2863 struct options *opt, struct device *dev, u32 seq)
2864 {
2865 struct sk_buff *buff;
2866 struct tcphdr *t1;
2867 unsigned char *ptr;
2868 struct sock *newsk;
2869 struct tcphdr *th;
2870 struct device *ndev=NULL;
2871 int tmp;
2872 struct rtable *rt;
2873
2874 th = skb->h.th;
2875
2876 /* If the socket is dead, don't accept the connection. */
2877 if (!sk->dead)
2878 {
2879 sk->data_ready(sk,0);
2880 }
2881 else
2882 {
2883 if(sk->debug)
2884 printk("Reset on %p: Connect on dead socket.\n",sk);
2885 tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
2886 tcp_statistics.TcpAttemptFails++;
2887 kfree_skb(skb, FREE_READ);
2888 return;
2889 }
2890
2891 /*
2892 * Make sure we can accept more. This will prevent a
2893 * flurry of syns from eating up all our memory.
2894 */
2895
2896 if (sk->ack_backlog >= sk->max_ack_backlog)
2897 {
2898 tcp_statistics.TcpAttemptFails++;
2899 kfree_skb(skb, FREE_READ);
2900 return;
2901 }
2902
2903 /*
2904 * We need to build a new sock struct.
2905 * It is sort of bad to have a socket without an inode attached
2906 * to it, but the wake_up's will just wake up the listening socket,
2907 * and if the listening socket is destroyed before this is taken
2908 * off of the queue, this will take care of it.
2909 */
2910
2911 newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
2912 if (newsk == NULL)
2913 {
2914 /* just ignore the syn. It will get retransmitted. */
2915 tcp_statistics.TcpAttemptFails++;
2916 kfree_skb(skb, FREE_READ);
2917 return;
2918 }
2919
2920 memcpy(newsk, sk, sizeof(*newsk));
2921 newsk->opt = NULL;
2922 if (opt && opt->optlen) {
2923 newsk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
2924 if (!newsk->opt) {
2925 kfree_s(newsk, sizeof(struct sock));
2926 tcp_statistics.TcpAttemptFails++;
2927 kfree_skb(skb, FREE_READ);
2928 return;
2929 }
2930 if (ip_options_echo(newsk->opt, opt, daddr, saddr, skb)) {
2931 kfree_s(newsk->opt, sizeof(struct options)+opt->optlen);
2932 kfree_s(newsk, sizeof(struct sock));
2933 tcp_statistics.TcpAttemptFails++;
2934 kfree_skb(skb, FREE_READ);
2935 return;
2936 }
2937 }
2938 skb_queue_head_init(&newsk->write_queue);
2939 skb_queue_head_init(&newsk->receive_queue);
2940 newsk->send_head = NULL;
2941 newsk->send_tail = NULL;
2942 skb_queue_head_init(&newsk->back_log);
2943 newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/
2944 newsk->rto = TCP_TIMEOUT_INIT;
2945 newsk->mdev = 0;
2946 newsk->max_window = 0;
2947 newsk->cong_window = 1;
2948 newsk->cong_count = 0;
2949 newsk->ssthresh = 0;
2950 newsk->backoff = 0;
2951 newsk->blog = 0;
2952 newsk->intr = 0;
2953 newsk->proc = 0;
2954 newsk->done = 0;
2955 newsk->partial = NULL;
2956 newsk->pair = NULL;
2957 newsk->wmem_alloc = 0;
2958 newsk->rmem_alloc = 0;
2959 newsk->localroute = sk->localroute;
2960
2961 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
2962
2963 newsk->err = 0;
2964 newsk->shutdown = 0;
2965 newsk->ack_backlog = 0;
2966 newsk->acked_seq = skb->h.th->seq+1;
2967 newsk->copied_seq = skb->h.th->seq+1;
2968 newsk->fin_seq = skb->h.th->seq;
2969 newsk->state = TCP_SYN_RECV;
2970 newsk->timeout = 0;
2971 newsk->ip_xmit_timeout = 0;
2972 newsk->write_seq = seq;
2973 newsk->window_seq = newsk->write_seq;
2974 newsk->rcv_ack_seq = newsk->write_seq;
2975 newsk->urg_data = 0;
2976 newsk->retransmits = 0;
2977 newsk->linger=0;
2978 newsk->destroy = 0;
2979 init_timer(&newsk->timer);
2980 newsk->timer.data = (unsigned long)newsk;
2981 newsk->timer.function = &net_timer;
2982 init_timer(&newsk->retransmit_timer);
2983 newsk->retransmit_timer.data = (unsigned long)newsk;
2984 newsk->retransmit_timer.function=&retransmit_timer;
2985 newsk->dummy_th.source = skb->h.th->dest;
2986 newsk->dummy_th.dest = skb->h.th->source;
2987
2988 /*
2989 * Swap these two, they are from our point of view.
2990 */
2991
2992 newsk->daddr = saddr;
2993 newsk->saddr = daddr;
2994 newsk->rcv_saddr = daddr;
2995
2996 put_sock(newsk->num,newsk);
2997 newsk->dummy_th.res1 = 0;
2998 newsk->dummy_th.doff = 6;
2999 newsk->dummy_th.fin = 0;
3000 newsk->dummy_th.syn = 0;
3001 newsk->dummy_th.rst = 0;
3002 newsk->dummy_th.psh = 0;
3003 newsk->dummy_th.ack = 0;
3004 newsk->dummy_th.urg = 0;
3005 newsk->dummy_th.res2 = 0;
3006 newsk->acked_seq = skb->h.th->seq + 1;
3007 newsk->copied_seq = skb->h.th->seq + 1;
3008 newsk->socket = NULL;
3009
3010 /*
3011 * Grab the ttl and tos values and use them
3012 */
3013
3014 newsk->ip_ttl=sk->ip_ttl;
3015 newsk->ip_tos=skb->ip_hdr->tos;
3016
3017 /*
3018 * Use 512 or whatever user asked for
3019 */
3020
3021 /*
3022 * Note use of sk->user_mss, since user has no direct access to newsk
3023 */
3024
3025 rt=ip_rt_route(saddr, NULL,NULL);
3026
3027 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
3028 newsk->window_clamp = rt->rt_window;
3029 else
3030 newsk->window_clamp = 0;
3031
3032 if (sk->user_mss)
3033 newsk->mtu = sk->user_mss;
3034 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
3035 newsk->mtu = rt->rt_mss - sizeof(struct iphdr) - sizeof(struct tcphdr);
3036 else
3037 {
3038 #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */
3039 if ((saddr ^ daddr) & default_mask(saddr))
3040 #else
3041 if ((saddr ^ daddr) & dev->pa_mask)
3042 #endif
3043 newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
3044 else
3045 newsk->mtu = MAX_WINDOW;
3046 }
3047
3048 /*
3049 * But not bigger than device MTU
3050 */
3051
3052 newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
3053
3054 /*
3055 * This will min with what arrived in the packet
3056 */
3057
3058 tcp_options(newsk,skb->h.th);
3059
3060 tcp_cache_zap();
3061
3062 buff = sock_wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
3063 if (buff == NULL)
3064 {
3065 sk->err = ENOMEM;
3066 newsk->dead = 1;
3067 newsk->state = TCP_CLOSE;
3068 /* And this will destroy it */
3069 release_sock(newsk);
3070 kfree_skb(skb, FREE_READ);
3071 tcp_statistics.TcpAttemptFails++;
3072 return;
3073 }
3074
3075 buff->sk = newsk;
3076 buff->localroute = newsk->localroute;
3077
3078 /*
3079 * Put in the IP header and routing stuff.
3080 */
3081
3082 tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
3083 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
3084
3085 /*
3086 * Something went wrong.
3087 */
3088
3089 if (tmp < 0)
3090 {
3091 sk->err = tmp;
3092 buff->free = 1;
3093 kfree_skb(buff,FREE_WRITE);
3094 newsk->dead = 1;
3095 newsk->state = TCP_CLOSE;
3096 release_sock(newsk);
3097 skb->sk = sk;
3098 kfree_skb(skb, FREE_READ);
3099 tcp_statistics.TcpAttemptFails++;
3100 return;
3101 }
3102
3103 t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
3104
3105 memcpy(t1, skb->h.th, sizeof(*t1));
3106 buff->h.seq = newsk->write_seq;
3107 /*
3108 * Swap the send and the receive.
3109 */
3110 t1->dest = skb->h.th->source;
3111 t1->source = newsk->dummy_th.source;
3112 t1->seq = ntohl(newsk->write_seq++);
3113 t1->ack = 1;
3114 newsk->window = tcp_select_window(newsk);
3115 newsk->sent_seq = newsk->write_seq;
3116 t1->window = ntohs(newsk->window);
3117 t1->res1 = 0;
3118 t1->res2 = 0;
3119 t1->rst = 0;
3120 t1->urg = 0;
3121 t1->psh = 0;
3122 t1->syn = 1;
3123 t1->ack_seq = ntohl(skb->h.th->seq+1);
3124 t1->doff = sizeof(*t1)/4+1;
3125 ptr = skb_put(buff,4);
3126 ptr[0] = 2;
3127 ptr[1] = 4;
3128 ptr[2] = ((newsk->mtu) >> 8) & 0xff;
3129 ptr[3] =(newsk->mtu) & 0xff;
3130
3131 tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
3132 newsk->prot->queue_xmit(newsk, ndev, buff, 0);
3133 reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
3134 skb->sk = newsk;
3135
3136 /*
3137 * Charge the sock_buff to newsk.
3138 */
3139
3140 sk->rmem_alloc -= skb->truesize;
3141 newsk->rmem_alloc += skb->truesize;
3142
3143 skb_queue_tail(&sk->receive_queue,skb);
3144 sk->ack_backlog++;
3145 release_sock(newsk);
3146 tcp_statistics.TcpOutSegs++;
3147 }
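
/*
 * The MSS selection scattered through tcp_conn_request() above,
 * restated as one helper. It assumes the same inputs (user override,
 * per-route MSS, whether the peer is on the local (sub)net, device
 * MTU), the 40 bytes of IP+TCP header, and MAX_WINDOW being 32767 as
 * in this kernel. The helper itself is illustrative.
 */
static int tcp_choose_mtu(int user_mss, int rt_mss, int local, int dev_mtu)
{
	int hdrs = 40;		/* sizeof(struct iphdr) + sizeof(struct tcphdr) */
	int mtu;

	if (user_mss)
		mtu = user_mss;			/* explicit user setting wins */
	else if (rt_mss)
		mtu = rt_mss - hdrs;		/* per-route MSS */
	else if (!local)
		mtu = 576 - hdrs;		/* conservative off-net default */
	else
		mtu = 32767;			/* MAX_WINDOW: let the device clamp it */

	if (mtu > dev_mtu - hdrs)		/* but never bigger than the device MTU */
		mtu = dev_mtu - hdrs;
	return mtu;
}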
3148
3149
3150 static void tcp_close(struct sock *sk, int timeout)
3151 {
3152 /*
3153 * We need to grab some memory, and put together a FIN,
3154 * and then put it into the queue to be sent.
3155 */
3156
3157 sk->inuse = 1;
3158
3159 if(th_cache_sk==sk)
3160 tcp_cache_zap();
3161 if(sk->state == TCP_LISTEN)
3162 {
3163 /* Special case */
3164 tcp_set_state(sk, TCP_CLOSE);
3165 tcp_close_pending(sk);
3166 release_sock(sk);
3167 return;
3168 }
3169
3170 sk->keepopen = 1;
3171 sk->shutdown = SHUTDOWN_MASK;
3172
3173 if (!sk->dead)
3174 sk->state_change(sk);
3175
3176 if (timeout == 0)
3177 {
3178 struct sk_buff *skb;
3179
3180 /*
3181 * We need to flush the recv. buffs. We do this only on the
3182 * descriptor close, not protocol-sourced closes, because the
3183 * reader process may not have drained the data yet!
3184 */
3185
3186 while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
3187 kfree_skb(skb, FREE_READ);
3188 /*
3189 * Get rid of any half-completed packets.
3190 */
3191
3192 if (sk->partial)
3193 tcp_send_partial(sk);
3194 }
3195
3196
3197 /*
3198 * Timeout is not the same thing - however the code likes
3199 * to send both the same way (sigh).
3200 */
3201
3202 if(timeout)
3203 {
3204 tcp_set_state(sk, TCP_CLOSE); /* Dead */
3205 }
3206 else
3207 {
3208 if(tcp_close_state(sk,1)==1)
3209 {
3210 tcp_send_fin(sk);
3211 }
3212 }
3213 release_sock(sk);
3214 }
3215
3216
3217 /*
3218 * This routine takes stuff off of the write queue,
3219 * and puts it in the xmit queue. This happens as incoming acks
3220 * open up the remote window for us.
3221 */
3222
3223 static void tcp_write_xmit(struct sock *sk)
3224 {
3225 struct sk_buff *skb;
3226
3227 /*
3228 * The bytes will have to remain here. In time closedown will
3229 * empty the write queue and all will be happy
3230 */
3231
3232 if(sk->zapped)
3233 return;
3234
3235 /*
3236 * Anything on the transmit queue that fits the window can
3237 * be added providing we are not
3238 *
3239 * a) retransmitting (Nagle's rule)
3240 * b) exceeding our congestion window.
3241 */
3242
3243 while((skb = skb_peek(&sk->write_queue)) != NULL &&
3244 before(skb->h.seq, sk->window_seq + 1) &&
3245 (sk->retransmits == 0 ||
3246 sk->ip_xmit_timeout != TIME_WRITE ||
3247 before(skb->h.seq, sk->rcv_ack_seq + 1))
3248 && sk->packets_out < sk->cong_window)
3249 {
3250 IS_SKB(skb);
3251 skb_unlink(skb);
3252
3253 /*
3254 * See if we really need to send the packet.
3255 */
3256
3257 if (before(skb->h.seq, sk->rcv_ack_seq +1))
3258 {
3259 /*
3260 * This is acked data. We can discard it. This
3261 * cannot currently occur.
3262 */
3263
3264 sk->retransmits = 0;
3265 kfree_skb(skb, FREE_WRITE);
3266 if (!sk->dead)
3267 sk->write_space(sk);
3268 }
3269 else
3270 {
3271 struct tcphdr *th;
3272 struct iphdr *iph;
3273 int size;
3274 /*
3275 * put in the ack seq and window at this point rather than earlier,
3276 * in order to keep them monotonic. We really want to avoid taking
3277 * back window allocations. That's legal, but RFC1122 says it's frowned on.
3278 * Ack and window will in general have changed since this packet was put
3279 * on the write queue.
3280 */
3281 iph = skb->ip_hdr;
3282 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
3283 size = skb->len - (((unsigned char *) th) - skb->data);
3284
3285 th->ack_seq = ntohl(sk->acked_seq);
3286 th->window = ntohs(tcp_select_window(sk));
3287
3288 tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
3289
3290 sk->sent_seq = skb->h.seq;
3291
3292 /*
3293 * IP manages our queue for some crazy reason
3294 */
3295
3296 sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);
3297
3298 /*
3299 * Again we slide the timer wrongly
3300 */
3301
3302 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3303 }
3304 }
3305 }
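
/*
 * The gate tcp_write_xmit() applies to each queued buffer, pulled out
 * as a predicate. The signed-difference compares stand in for the
 * before() macro used above; names and the helper are illustrative.
 */
static int tcp_may_send(unsigned long seq, unsigned long window_seq,
	int retransmits, int timeout_is_write,
	unsigned long rcv_ack_seq, unsigned long packets_out,
	unsigned long cong_window)
{
	/* The whole segment must fit inside the offered window. */
	if ((long)(seq - (window_seq + 1)) >= 0)
		return 0;
	/* While retransmitting on the write timer, send nothing beyond
	   data that has already been acked (Nagle rule, clause a). */
	if (retransmits && timeout_is_write &&
	    (long)(seq - (rcv_ack_seq + 1)) >= 0)
		return 0;
	/* And never exceed the congestion window (clause b). */
	return packets_out < cong_window;
}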
3306
3307
3308 /*
3309 * This routine deals with incoming acks, but not outgoing ones.
3310 */
3311
3312 extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
3313 {
3314 u32 ack;
3315 int flag = 0;
3316
3317 /*
3318 * 1 - there was data in packet as well as ack or new data is sent or
3319 * in shutdown state
3320 * 2 - data from retransmit queue was acked and removed
3321 * 4 - window shrunk or data from retransmit queue was acked and removed
3322 */
3323
3324 if(sk->zapped)
3325 return(1); /* Dead, can't ack any more so why bother */
3326
3327 /*
3328 * Have we discovered a larger window?
3329 */
3330
3331 ack = ntohl(th->ack_seq);
3332
3333 if (ntohs(th->window) > sk->max_window)
3334 {
3335 sk->max_window = ntohs(th->window);
3336 #ifdef CONFIG_INET_PCTCP
3337 /* Hack because we don't send partial packets to non SWS
3338 handling hosts */
3339 sk->mss = min(sk->max_window>>1, sk->mtu);
3340 #else
3341 sk->mss = min(sk->max_window, sk->mtu);
3342 #endif
3343 }
3344
3345 /*
3346 * We have dropped back to keepalive timeouts. Thus we have
3347 * no retransmits pending.
3348 */
3349
3350 if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
3351 sk->retransmits = 0;
3352
3353 /*
3354 * If the ack is newer than sent or older than previous acks
3355 * then we can probably ignore it.
3356 */
3357
3358 if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
3359 {
3360 if(sk->debug)
3361 printk("Ack ignored %u %u\n",ack,sk->sent_seq);
3362
3363 /*
3364 * Keepalive processing.
3365 */
3366
3367 if (after(ack, sk->sent_seq))
3368 {
3369 return(0);
3370 }
3371
3372 /*
3373 * Restart the keepalive timer.
3374 */
3375
3376 if (sk->keepopen)
3377 {
3378 if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
3379 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3380 }
3381 return(1);
3382 }
3383
3384 /*
3385 * If there is data set flag 1
3386 */
3387
3388 if (len != th->doff*4)
3389 flag |= 1;
3390
3391 /*
3392 * See if our window has been shrunk.
3393 */
3394
3395 if (after(sk->window_seq, ack+ntohs(th->window)))
3396 {
3397 /*
3398 * We may need to move packets from the send queue
3399 * to the write queue, if the window has been shrunk on us.
3400 * The RFC says you are not allowed to shrink your window
3401 * like this, but if the other end does, you must be able
3402 * to deal with it.
3403 */
3404 struct sk_buff *skb;
3405 struct sk_buff *skb2;
3406 struct sk_buff *wskb = NULL;
3407
3408 skb2 = sk->send_head;
3409 sk->send_head = NULL;
3410 sk->send_tail = NULL;
3411
3412 /*
3413 * This is an artifact of a flawed concept. We want one
3414 * queue and a smarter send routine when we send all.
3415 */
3416
3417 flag |= 4; /* Window changed */
3418
3419 sk->window_seq = ack + ntohs(th->window);
3420 cli();
3421 while (skb2 != NULL)
3422 {
3423 skb = skb2;
3424 skb2 = skb->link3;
3425 skb->link3 = NULL;
3426 if (after(skb->h.seq, sk->window_seq))
3427 {
3428 if (sk->packets_out > 0)
3429 sk->packets_out--;
3430 /* We may need to remove this from the dev send list. */
3431 if (skb->next != NULL)
3432 {
3433 skb_unlink(skb);
3434 }
3435 /* Now add it to the write_queue. */
3436 if (wskb == NULL)
3437 skb_queue_head(&sk->write_queue,skb);
3438 else
3439 skb_append(wskb,skb);
3440 wskb = skb;
3441 }
3442 else
3443 {
3444 if (sk->send_head == NULL)
3445 {
3446 sk->send_head = skb;
3447 sk->send_tail = skb;
3448 }
3449 else
3450 {
3451 sk->send_tail->link3 = skb;
3452 sk->send_tail = skb;
3453 }
3454 skb->link3 = NULL;
3455 }
3456 }
3457 sti();
3458 }
3459
3460 /*
3461 * Pipe has emptied
3462 */
3463
3464 if (sk->send_tail == NULL || sk->send_head == NULL)
3465 {
3466 sk->send_head = NULL;
3467 sk->send_tail = NULL;
3468 sk->packets_out= 0;
3469 }
3470
3471 /*
3472 * Update the right hand window edge of the host
3473 */
3474
3475 sk->window_seq = ack + ntohs(th->window);
3476
3477 /*
3478 * We don't want too many packets out there.
3479 */
3480
3481 if (sk->ip_xmit_timeout == TIME_WRITE &&
3482 sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
3483 {
3484 /*
3485 * This is Jacobson's slow start and congestion avoidance.
3486 * SIGCOMM '88, p. 328. Because we keep cong_window in integral
3487 * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a
3488 * counter and increment it once every cwnd times. It's possible
3489 * that this should be done only if sk->retransmits == 0. I'm
3490 * interpreting "new data is acked" as including data that has
3491 * been retransmitted but is just now being acked.
3492 */
3493 if (sk->cong_window < sk->ssthresh)
3494 /*
3495 * In "safe" area, increase
3496 */
3497 sk->cong_window++;
3498 else
3499 {
3500 /*
3501 * In dangerous area, increase slowly. In theory this is
3502 * sk->cong_window += 1 / sk->cong_window
3503 */
3504 if (sk->cong_count >= sk->cong_window)
3505 {
3506 sk->cong_window++;
3507 sk->cong_count = 0;
3508 }
3509 else
3510 sk->cong_count++;
3511 }
3512 }
3513
3514 /*
3515 * Remember the highest ack received.
3516 */
3517
3518 sk->rcv_ack_seq = ack;
3519
3520 /*
3521 * If this ack opens up a zero window, clear backoff. It was
3522 * being used to time the probes, and is probably far higher than
3523 * it needs to be for normal retransmission.
3524 */
3525
3526 if (sk->ip_xmit_timeout == TIME_PROBE0)
3527 {
3528 sk->retransmits = 0; /* Our probe was answered */
3529
3530 /*
3531 * Was it a usable window open ?
3532 */
3533
3534 if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */
3535 ! before (sk->window_seq, sk->write_queue.next->h.seq))
3536 {
3537 sk->backoff = 0;
3538
3539 /*
3540 * Recompute rto from rtt. this eliminates any backoff.
3541 */
3542
3543 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3544 if (sk->rto > 120*HZ)
3545 sk->rto = 120*HZ;
3546 if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about
3547 .2 of a second because of BSD delayed acks - on a 100Mb/sec link
3548 .2 of a second is going to need huge windows (SIGH) */
3549 sk->rto = 20;
3550 }
3551 }
3552
3553 /*
3554 * See if we can take anything off of the retransmit queue.
3555 */
3556
3557 while(sk->send_head != NULL)
3558 {
3559 /* Check for a bug. */
3560 if (sk->send_head->link3 &&
3561 after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
3562 printk("INET: tcp.c: *** bug send_list out of order.\n");
3563
3564 /*
3565 * If our packet is before the ack sequence we can
3566 * discard it, as it's confirmed to have arrived at the other end.
3567 */
3568
3569 if (before(sk->send_head->h.seq, ack+1))
3570 {
3571 struct sk_buff *oskb;
3572 if (sk->retransmits)
3573 {
3574 /*
3575 * We were retransmitting. don't count this in RTT est
3576 */
3577 flag |= 2;
3578
3579 /*
3580 * even though we've gotten an ack, we're still
3581 * retransmitting as long as we're sending from
3582 * the retransmit queue. Keeping retransmits non-zero
3583 * prevents us from getting new data interspersed with
3584 * retransmissions.
3585 */
3586
3587 if (sk->send_head->link3) /* Any more queued retransmits? */
3588 sk->retransmits = 1;
3589 else
3590 sk->retransmits = 0;
3591 }
3592 /*
3593 * Note that we only reset backoff and rto in the
3594 * rtt recomputation code. And that doesn't happen
3595 * if there were retransmissions in effect. So the
3596 * first new packet after the retransmissions is
3597 * sent with the backoff still in effect. Not until
3598 * we get an ack from a non-retransmitted packet do
3599 * we reset the backoff and rto. This allows us to deal
3600 * with a situation where the network delay has increased
3601 * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.)
3602 */
3603
3604 /*
3605 * We have one less packet out there.
3606 */
3607
3608 if (sk->packets_out > 0)
3609 sk->packets_out --;
3610 /*
3611 * Wake up the process, it can probably write more.
3612 */
3613 if (!sk->dead)
3614 sk->write_space(sk);
3615 oskb = sk->send_head;
3616
3617 if (!(flag&2)) /* Not retransmitting */
3618 {
3619 long m;
3620
3621 /*
3622 * The following amusing code comes from Jacobson's
3623 * article in SIGCOMM '88. Note that rtt and mdev
3624 * are scaled versions of rtt and mean deviation.
3625 * This is designed to be as fast as possible
3626 * m stands for "measurement".
3627 */
3628
3629 m = jiffies - oskb->when; /* RTT */
3630 if(m<=0)
3631 m=1; /* IS THIS RIGHT FOR <0 ??? */
3632 m -= (sk->rtt >> 3); /* m is now error in rtt est */
3633 sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */
3634 if (m < 0)
3635 m = -m; /* m is now abs(error) */
3636 m -= (sk->mdev >> 2); /* similar update on mdev */
3637 sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
3638
3639 /*
3640 * Now update timeout. Note that this removes any backoff.
3641 */
3642
3643 sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
3644 if (sk->rto > 120*HZ)
3645 sk->rto = 120*HZ;
3646 if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
3647 sk->rto = 20;
3648 sk->backoff = 0;
3649 }
3650 flag |= (2|4); /* 2 is really more like 'don't adjust the rtt
3651 In this case as we just set it up */
3652 cli();
3653 oskb = sk->send_head;
3654 IS_SKB(oskb);
3655 sk->send_head = oskb->link3;
3656 if (sk->send_head == NULL)
3657 {
3658 sk->send_tail = NULL;
3659 }
3660
3661 /*
3662 * We may need to remove this from the dev send list.
3663 */
3664
3665 if (oskb->next)
3666 skb_unlink(oskb);
3667 sti();
3668 kfree_skb(oskb, FREE_WRITE); /* write. */
3669 if (!sk->dead)
3670 sk->write_space(sk);
3671 }
3672 else
3673 {
3674 break;
3675 }
3676 }
3677
3678 /*
3679 * XXX someone ought to look at this too.. at the moment, if skb_peek()
3680 * returns non-NULL, we completely ignore the timer stuff in the else
3681 * clause. We ought to organize the code so that the else clause can
3682 * (should) be executed regardless, possibly moving the PROBE timer
3683 * reset over. The skb_peek() thing should only move stuff to the
3684 * write queue, NOT also manage the timer functions.
3685 */
3686
3687 /*
3688 * Maybe we can take some stuff off of the write queue,
3689 * and put it onto the xmit queue.
3690 */
3691 if (skb_peek(&sk->write_queue) != NULL)
3692 {
3693 if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
3694 (sk->retransmits == 0 ||
3695 sk->ip_xmit_timeout != TIME_WRITE ||
3696 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
3697 && sk->packets_out < sk->cong_window)
3698 {
3699 /*
3700 * Add more data to the send queue.
3701 */
3702 flag |= 1;
3703 tcp_write_xmit(sk);
3704 }
3705 else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
3706 sk->send_head == NULL &&
3707 sk->ack_backlog == 0 &&
3708 sk->state != TCP_TIME_WAIT)
3709 {
3710 /*
3711 * Data to queue but no room.
3712 */
3713 reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
3714 }
3715 }
3716 else
3717 {
3718 /*
3719 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets;
3720 * from TCP_CLOSE we don't do anything.
3721 *
3722 * from anything else, if there is write data (or fin) pending,
3723 * we use a TIME_WRITE timeout, else if keepalive we reset to
3724 * a KEEPALIVE timeout, else we delete the timer.
3725 *
3726 * We do not set flag for nominal write data, otherwise we may
3727 * force a state where we start to write itsy bitsy tidbits
3728 * of data.
3729 */
3730
3731 switch(sk->state) {
3732 case TCP_TIME_WAIT:
3733 /*
3734 * keep us in TIME_WAIT until we stop getting packets,
3735 * reset the timeout.
3736 */
3737 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3738 break;
3739 case TCP_CLOSE:
3740 /*
3741 * don't touch the timer.
3742 */
3743 break;
3744 default:
3745 /*
3746 * Must check send_head, write_queue, and ack_backlog
3747 * to determine which timeout to use.
3748 */
3749 if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
3750 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3751 } else if (sk->keepopen) {
3752 reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
3753 } else {
3754 del_timer(&sk->retransmit_timer);
3755 sk->ip_xmit_timeout = 0;
3756 }
3757 break;
3758 }
3759 }
3760
3761 /*
3762 * We have nothing queued but space to send. Send any partial
3763 * packets immediately (end of Nagle rule application).
3764 */
3765
3766 if (sk->packets_out == 0 && sk->partial != NULL &&
3767 skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
3768 {
3769 flag |= 1;
3770 tcp_send_partial(sk);
3771 }
3772
3773 /*
3774 * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
3775 * we are now waiting for an acknowledge to our FIN. The other end is
3776 * already in TIME_WAIT.
3777 *
3778 * Move to TCP_CLOSE on success.
3779 */
3780
3781 if (sk->state == TCP_LAST_ACK)
3782 {
3783 if (!sk->dead)
3784 sk->state_change(sk);
3785 if(sk->debug)
3786 printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
3787 sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
3788 if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
3789 {
3790 flag |= 1;
3791 tcp_set_state(sk,TCP_CLOSE);
3792 sk->shutdown = SHUTDOWN_MASK;
3793 }
3794 }
3795
3796 /*
3797 * Incoming ACK to a FIN we sent in the case of our initiating the close.
3798 *
3799 * Move to FIN_WAIT2 to await a FIN from the other end. Set
3800 * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
3801 */
3802
3803 if (sk->state == TCP_FIN_WAIT1)
3804 {
3805
3806 if (!sk->dead)
3807 sk->state_change(sk);
3808 if (sk->rcv_ack_seq == sk->write_seq)
3809 {
3810 flag |= 1;
3811 sk->shutdown |= SEND_SHUTDOWN;
3812 tcp_set_state(sk, TCP_FIN_WAIT2);
3813 }
3814 }
3815
3816 /*
3817 * Incoming ACK to a FIN we sent in the case of a simultaneous close.
3818 *
3819 * Move to TIME_WAIT
3820 */
3821
3822 if (sk->state == TCP_CLOSING)
3823 {
3824
3825 if (!sk->dead)
3826 sk->state_change(sk);
3827 if (sk->rcv_ack_seq == sk->write_seq)
3828 {
3829 flag |= 1;
3830 tcp_time_wait(sk);
3831 }
3832 }
3833
3834 /*
3835 * Final ack of a three way shake
3836 */
3837
3838 if(sk->state==TCP_SYN_RECV)
3839 {
3840 tcp_set_state(sk, TCP_ESTABLISHED);
3841 tcp_options(sk,th);
3842 sk->dummy_th.dest=th->source;
3843 sk->copied_seq = sk->acked_seq;
3844 if(!sk->dead)
3845 sk->state_change(sk);
3846 if(sk->max_window==0)
3847 {
3848 sk->max_window=32; /* Sanity check */
3849 sk->mss=min(sk->max_window,sk->mtu);
3850 }
3851 }
3852
3853 /*
3854 * I make no guarantees about the first clause in the following
3855 * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under
3856 * what conditions "!flag" would be true. However I think the rest
3857 * of the conditions would prevent that from causing any
3858 * unnecessary retransmission.
3859 * Clearly if the first packet has expired it should be
3860 * retransmitted. The other alternative, "flag&2 && retransmits", is
3861 * harder to explain: You have to look carefully at how and when the
3862 * timer is set and with what timeout. The most recent transmission always
3863 * sets the timer. So in general if the most recent thing has timed
3864 * out, everything before it has as well. So we want to go ahead and
3865 * retransmit some more. If we didn't explicitly test for this
3866 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
3867 * would not be true. If you look at the pattern of timing, you can
3868 * show that rto is increased fast enough that the next packet would
3869 * almost never be retransmitted immediately. Then you'd end up
3870 * waiting for a timeout to send each packet on the retransmission
3871 * queue. With my implementation of the Karn sampling algorithm,
3872 * the timeout would double each time. The net result is that it would
3873 * take a hideous amount of time to recover from a single dropped packet.
3874 * It's possible that there should also be a test for TIME_WRITE, but
3875 * I think as long as "send_head != NULL" and "retransmit" is on, we've
3876 * got to be in real retransmission mode.
3877 * Note that tcp_do_retransmit is called with all==1. Setting cong_window
3878 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
3879 * As long as no further losses occur, this seems reasonable.
3880 */
3881
3882 if (((!flag) || (flag&4)) && sk->send_head != NULL &&
3883 (((flag&2) && sk->retransmits) ||
3884 (sk->send_head->when + sk->rto < jiffies)))
3885 {
3886 if(sk->send_head->when + sk->rto < jiffies)
3887 tcp_retransmit(sk,0);
3888 else
3889 {
3890 tcp_do_retransmit(sk, 1);
3891 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3892 }
3893 }
3894
3895 return(1);
3896 }
3897
3898
3899 /*
3900 * Process the FIN bit. This now behaves as it is supposed to:
3901 * the FIN takes effect only when it is validly part of sequence
3902 * space, not earlier, while we still have holes.
3903 *
3904 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
3905 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
3906 * TIME-WAIT)
3907 *
3908 * If we are in FINWAIT-1, a received FIN indicates simultaneous
3909 * close and we go into CLOSING (and later onto TIME-WAIT)
3910 *
3911 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
3912 *
3913 */
3914
3915 static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
3916 {
3917 sk->fin_seq = th->seq + skb->len + th->syn + th->fin;
3918
3919 if (!sk->dead)
3920 {
3921 sk->state_change(sk);
3922 sock_wake_async(sk->socket, 1);
3923 }
3924
3925 switch(sk->state)
3926 {
3927 case TCP_SYN_RECV:
3928 case TCP_SYN_SENT:
3929 case TCP_ESTABLISHED:
3930 /*
3931 * move to CLOSE_WAIT, tcp_data() already handled
3932 * sending the ack.
3933 */
3934 tcp_set_state(sk,TCP_CLOSE_WAIT);
3935 if (th->rst)
3936 sk->shutdown = SHUTDOWN_MASK;
3937 break;
3938
3939 case TCP_CLOSE_WAIT:
3940 case TCP_CLOSING:
3941 /*
3942 * received a retransmission of the FIN, do
3943 * nothing.
3944 */
3945 break;
3946 case TCP_TIME_WAIT:
3947 /*
3948 * received a retransmission of the FIN,
3949 * restart the TIME_WAIT timer.
3950 */
3951 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3952 return(0);
3953 case TCP_FIN_WAIT1:
3954 /*
3955 * This case occurs when a simultaneous close
3956 * happens, we must ack the received FIN and
3957 * enter the CLOSING state.
3958 *
3959 * This causes a WRITE timeout, which will either
3960 * move on to TIME_WAIT when we timeout, or resend
3961 * the FIN properly (maybe we get rid of that annoying
3962 * FIN lost hang). The TIME_WRITE code is already correct
3963 * for handling this timeout.
3964 */
3965
3966 if(sk->ip_xmit_timeout != TIME_WRITE)
3967 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
3968 tcp_set_state(sk,TCP_CLOSING);
3969 break;
3970 case TCP_FIN_WAIT2:
3971 /*
3972 * received a FIN -- send ACK and enter TIME_WAIT
3973 */
3974 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3975 sk->shutdown|=SHUTDOWN_MASK;
3976 tcp_set_state(sk,TCP_TIME_WAIT);
3977 break;
3978 case TCP_CLOSE:
3979 /*
3980 * already in CLOSE
3981 */
3982 break;
3983 default:
3984 tcp_set_state(sk,TCP_LAST_ACK);
3985
3986 /* Start the timers. */
3987 reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
3988 return(0);
3989 }
3990
3991 return(0);
3992 }
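
/*
 * The transitions above can be read as a pure mapping from old state to
 * new state on a valid FIN. A minimal sketch, illustration only and not
 * built; the real code above also manages the timers and shutdown flags:
 */
#if 0
static int tcp_fin_next_state(int state)
{
	switch (state) {
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			return TCP_CLOSE_WAIT;	/* passive close begins */
		case TCP_FIN_WAIT1:
			return TCP_CLOSING;	/* simultaneous close */
		case TCP_FIN_WAIT2:
			return TCP_TIME_WAIT;	/* active close completes */
		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
		case TCP_TIME_WAIT:
		case TCP_CLOSE:
			return state;		/* retransmitted FIN, no change */
		default:
			return TCP_LAST_ACK;	/* their FIN crossed our close */
	}
}
#endif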
3993
3994
3995
3996 /*
3997 * This routine handles the data. If there is room in the buffer,
3998 * it will already have been moved into it. If there is no
3999 * room, then we will just have to discard the packet.
4000 */
4001
4002 extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
4003 unsigned long saddr, unsigned short len)
4004 {
4005 struct sk_buff *skb1, *skb2;
4006 struct tcphdr *th;
4007 int dup_dumped=0;
4008 u32 new_seq, shut_seq;
4009
4010 th = skb->h.th;
4011 skb_pull(skb,th->doff*4);
4012 skb_trim(skb,len-(th->doff*4));
4013
4014 /*
4015 * The bytes in the receive read/assembly queue have increased. Needed for the
4016 * low memory discard algorithm
4017 */
4018
4019 sk->bytes_rcv += skb->len;
4020
4021 if (skb->len == 0 && !th->fin)
4022 {
4023 /*
4024 * Don't want to keep passing ack's back and forth.
4025 * (someone sent us dataless, boring frame)
4026 */
4027 if (!th->ack)
4028 tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
4029 kfree_skb(skb, FREE_READ);
4030 return(0);
4031 }
4032
4033 /*
4034 * We no longer have anyone receiving data on this connection.
4035 */
4036
4037 #ifndef TCP_DONT_RST_SHUTDOWN
4038
4039 if(sk->shutdown & RCV_SHUTDOWN)
4040 {
4041 /*
4042 * FIXME: BSD has some magic to avoid sending resets to
4043 * broken 4.2 BSD keepalives. Much to my surprise a few non
4044 * BSD stacks still have broken keepalives so we want to
4045 * cope with it.
4046 */
4047
4048 if(skb->len) /* We don't care if it's just an ack or
4049 a keepalive/window probe */
4050 {
4051 new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */
4052
4053 /* Do this the way 4.4BSD treats it. Not what I'd
4054 regard as the meaning of the spec but it's what BSD
4055 does and clearly they know everything 8) */
4056
4057 /*
4058 * This is valid because of two things
4059 *
4060 * a) The way tcp_data behaves at the bottom.
4061 * b) A fin takes effect when read not when received.
4062 */
4063
4064 shut_seq=sk->acked_seq+1; /* Last byte */
4065
4066 if(after(new_seq,shut_seq))
4067 {
4068 if(sk->debug)
4069 printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
4070 sk, new_seq, shut_seq, sk->blog);
4071 if(sk->dead)
4072 {
4073 sk->acked_seq = new_seq + th->fin;
4074 tcp_reset(sk->saddr, sk->daddr, skb->h.th,
4075 sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
4076 tcp_statistics.TcpEstabResets++;
4077 tcp_set_state(sk,TCP_CLOSE);
4078 sk->err = EPIPE;
4079 sk->shutdown = SHUTDOWN_MASK;
4080 kfree_skb(skb, FREE_READ);
4081 return 0;
4082 }
4083 }
4084 }
4085 }
4086
4087 #endif
4088
4089 /*
4090 * Now we have to walk the chain, and figure out where this one
4091 * goes into it. This is set up so that the last packet we received
4092 * will be the first one we look at, that way if everything comes
4093 * in order, there will be no performance loss, and if they come
4094 * out of order we will be able to fit things in nicely.
4095 *
4096 * [AC: This is wrong. We should assume in order first and then walk
4097 * forwards from the first hole based upon real traffic patterns.]
4098 *
4099 */
4100
4101 if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */
4102 {
4103 skb_queue_head(&sk->receive_queue,skb);
4104 skb1= NULL;
4105 }
4106 else
4107 {
4108 for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev)
4109 {
4110 if(sk->debug)
4111 {
4112 printk("skb1=%p :", skb1);
4113 printk("skb1->h.th->seq = %d: ", skb1->h.th->seq);
4114 printk("skb->h.th->seq = %d\n",skb->h.th->seq);
4115 printk("copied_seq = %d acked_seq = %d\n", sk->copied_seq,
4116 sk->acked_seq);
4117 }
4118
4119 /*
4120 * Optimisation: Duplicate frame or extension of previous frame from
4121 * same sequence point (lost ack case).
4122 * The frame contains duplicate data or replaces a previous frame:
4123 * discard the previous frame (safe as sk->inuse is set) and put
4124 * the new one in its place.
4125 */
4126
4127 if (th->seq==skb1->h.th->seq && skb->len>= skb1->len)
4128 {
4129 skb_append(skb1,skb);
4130 skb_unlink(skb1);
4131 kfree_skb(skb1,FREE_READ);
4132 dup_dumped=1;
4133 skb1=NULL;
4134 break;
4135 }
4136
4137 /*
4138 * Found where it fits
4139 */
4140
4141 if (after(th->seq+1, skb1->h.th->seq))
4142 {
4143 skb_append(skb1,skb);
4144 break;
4145 }
4146
4147 /*
4148 * See if we've hit the start. If so insert.
4149 */
4150 if (skb1 == skb_peek(&sk->receive_queue))
4151 {
4152 skb_queue_head(&sk->receive_queue, skb);
4153 break;
4154 }
4155 }
4156 }
4157
4158 /*
4159 * Figure out what the ack value for this frame is
4160 */
4161
4162 th->ack_seq = th->seq + skb->len;
4163 if (th->syn)
4164 th->ack_seq++;
4165 if (th->fin)
4166 th->ack_seq++;
4167
4168 if (before(sk->acked_seq, sk->copied_seq))
4169 {
4170 printk("*** tcp.c:tcp_data bug acked < copied\n");
4171 sk->acked_seq = sk->copied_seq;
4172 }
4173
4174 /*
4175 * Now figure out if we can ack anything. This is very messy because we really want two
4176 * receive queues, a completed and an assembly queue. We also want only one transmit
4177 * queue.
4178 */
4179
4180 if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1))
4181 {
4182 if (before(th->seq, sk->acked_seq+1))
4183 {
4184 int newwindow;
4185
4186 if (after(th->ack_seq, sk->acked_seq))
4187 {
4188 newwindow = sk->window-(th->ack_seq - sk->acked_seq);
4189 if (newwindow < 0)
4190 newwindow = 0;
4191 sk->window = newwindow;
4192 sk->acked_seq = th->ack_seq;
4193 }
4194 skb->acked = 1;
4195
4196 /*
4197 * When we ack the fin, we do the FIN
4198 * processing.
4199 */
4200
4201 if (skb->h.th->fin)
4202 {
4203 tcp_fin(skb,sk,skb->h.th);
4204 }
4205
4206 for(skb2 = skb->next;
4207 skb2 != (struct sk_buff *)&sk->receive_queue;
4208 skb2 = skb2->next)
4209 {
4210 if (before(skb2->h.th->seq, sk->acked_seq+1))
4211 {
4212 if (after(skb2->h.th->ack_seq, sk->acked_seq))
4213 {
4214 newwindow = sk->window -
4215 (skb2->h.th->ack_seq - sk->acked_seq);
4216 if (newwindow < 0)
4217 newwindow = 0;
4218 sk->window = newwindow;
4219 sk->acked_seq = skb2->h.th->ack_seq;
4220 }
4221 skb2->acked = 1;
4222 /*
4223 * When we ack the fin, we do
4224 * the fin handling.
4225 */
4226 if (skb2->h.th->fin)
4227 {
4228 tcp_fin(skb2,sk,skb2->h.th);
4229 }
4230
4231 /*
4232 * Force an immediate ack.
4233 */
4234
4235 sk->ack_backlog = sk->max_ack_backlog;
4236 }
4237 else
4238 {
4239 break;
4240 }
4241 }
4242
4243 /*
4244 * This also takes care of updating the window.
4245 * This if statement needs to be simplified.
4246 */
4247 if (!sk->delay_acks ||
4248 sk->ack_backlog >= sk->max_ack_backlog ||
4249 sk->bytes_rcv > sk->max_unacked || th->fin) {
4250 /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */
4251 }
4252 else
4253 {
4254 sk->ack_backlog++;
4255 if(sk->debug)
4256 printk("Ack queued.\n");
4257 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4258 }
4259 }
4260 }
4261
4262 /*
4263 * If we've missed a packet, send an ack.
4264 * Also start a timer to send another.
4265 */
4266
4267 if (!skb->acked)
4268 {
4269
4270 /*
4271 * This is important. If we don't have much room left,
4272 * we need to throw out a few packets so we have a good
4273 * window. Note that mtu is used, not mss, because mss is really
4274 * for the send side. He could be sending us stuff as large as mtu.
4275 */
4276
4277 while (sock_rspace(sk) < sk->mtu)
4278 {
4279 skb1 = skb_peek(&sk->receive_queue);
4280 if (skb1 == NULL)
4281 {
4282 printk("INET: tcp.c:tcp_data memory leak detected.\n");
4283 break;
4284 }
4285
4286 /*
4287 * Don't throw out something that has been acked.
4288 */
4289
4290 if (skb1->acked)
4291 {
4292 break;
4293 }
4294
4295 skb_unlink(skb1);
4296 kfree_skb(skb1, FREE_READ);
4297 }
4298 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4299 sk->ack_backlog++;
4300 reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
4301 }
4302 else
4303 {
4304 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4305 }
4306
4307 /*
4308 * Now tell the user we may have some data.
4309 */
4310
4311 if (!sk->dead)
4312 {
4313 if(sk->debug)
4314 printk("Data wakeup.\n");
4315 sk->data_ready(sk,0);
4316 }
4317 return(0);
4318 }
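
/*
 * The queue handling above keeps sk->receive_queue sorted by sequence
 * number by walking backwards from the tail, betting that in-order
 * arrival is the common case. A stripped-down sketch of just that
 * insert, where seq_of() is a hypothetical stand-in for skb->h.th->seq:
 */
#if 0
static void ordered_insert(struct sk_buff_head *list, struct sk_buff *skb)
{
	struct sk_buff *pos;

	if (skb_peek(list) == NULL)
	{
		skb_queue_head(list, skb);		/* empty list is easy */
		return;
	}
	for (pos = list->prev; ; pos = pos->prev)
	{
		if (after(seq_of(skb) + 1, seq_of(pos)))
		{
			skb_append(pos, skb);		/* fits after pos */
			return;
		}
		if (pos == skb_peek(list))
		{
			skb_queue_head(list, skb);	/* new head of queue */
			return;
		}
	}
}
#endif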
4319
4320
4321 /*
4322 * This routine is only called when we have urgent data
4323 * signalled. It's the 'slow' part of tcp_urg. It could be
4324 * moved inline now as tcp_urg is only called from one
4325 * place. We handle URGent data wrong. We have to - as
4326 * BSD still doesn't use the correction from RFC961.
4327 */
4328
4329 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
4330 {
4331 u32 ptr = ntohs(th->urg_ptr);
4332
4333 if (ptr)
4334 ptr--;
4335 ptr += th->seq;
4336
4337 /* ignore urgent data that we've already seen and read */
4338 if (after(sk->copied_seq, ptr))
4339 return;
4340
4341 /* do we already have a newer (or duplicate) urgent pointer? */
4342 if (sk->urg_data && !after(ptr, sk->urg_seq))
4343 return;
4344
4345 /* tell the world about our new urgent pointer */
4346 if (sk->proc != 0) {
4347 if (sk->proc > 0) {
4348 kill_proc(sk->proc, SIGURG, 1);
4349 } else {
4350 kill_pg(-sk->proc, SIGURG, 1);
4351 }
4352 }
4353 sk->urg_data = URG_NOTYET;
4354 sk->urg_seq = ptr;
4355 }
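
/*
 * Worked example of the pointer arithmetic above: with th->seq = 1000
 * and th->urg_ptr = 3 (treated, as BSD does, as pointing one past the
 * urgent byte), ptr becomes 3 - 1 + 1000 = 1002, the absolute sequence
 * number of the urgent byte itself. In isolation (sketch only):
 */
#if 0
	u32 seq = 1000, ptr = 3;

	if (ptr)
		ptr--;			/* back up onto the urgent byte */
	ptr += seq;			/* ptr == 1002 */
#endif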
4356
4357 /*
4358 * This is the 'fast' part of urgent handling.
4359 */
4360
4361 extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
4362 unsigned long saddr, unsigned long len)
4363 {
4364 u32 ptr;
4365
4366 /*
4367 * Check if we get a new urgent pointer - normally not
4368 */
4369
4370 if (th->urg)
4371 tcp_check_urg(sk,th);
4372
4373 /*
4374 * Do we wait for any urgent data? - normally not
4375 */
4376
4377 if (sk->urg_data != URG_NOTYET)
4378 return 0;
4379
4380 /*
4381 * Is the urgent pointer pointing into this packet?
4382 */
4383
4384 ptr = sk->urg_seq - th->seq + th->doff*4;
4385 if (ptr >= len)
4386 return 0;
4387
4388 /*
4389 * Ok, got the correct packet, update info
4390 */
4391
4392 sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
4393 if (!sk->dead)
4394 sk->data_ready(sk,0);
4395 return 0;
4396 }
4397
4398 /*
4399 * This will accept the next outstanding connection.
4400 */
4401
4402 static struct sock *tcp_accept(struct sock *sk, int flags)
4403 {
4404 struct sock *newsk;
4405 struct sk_buff *skb;
4406
4407 /*
4408 * We need to make sure that this socket is listening,
4409 * and that it has something pending.
4410 */
4411
4412 if (sk->state != TCP_LISTEN)
4413 {
4414 sk->err = EINVAL;
4415 return(NULL);
4416 }
4417
4418 /* Avoid the race. */
4419 cli();
4420 sk->inuse = 1;
4421
4422 while((skb = tcp_dequeue_established(sk)) == NULL)
4423 {
4424 if (flags & O_NONBLOCK)
4425 {
4426 sti();
4427 release_sock(sk);
4428 sk->err = EAGAIN;
4429 return(NULL);
4430 }
4431
4432 release_sock(sk);
4433 interruptible_sleep_on(sk->sleep);
4434 if (current->signal & ~current->blocked)
4435 {
4436 sti();
4437 sk->err = ERESTARTSYS;
4438 return(NULL);
4439 }
4440 sk->inuse = 1;
4441 }
4442 sti();
4443
4444 /*
4445 * Now all we need to do is return skb->sk.
4446 */
4447
4448 newsk = skb->sk;
4449
4450 kfree_skb(skb, FREE_READ);
4451 sk->ack_backlog--;
4452 release_sock(sk);
4453 return(newsk);
4454 }
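
/*
 * Seen from user space this is the usual BSD accept() behaviour: a
 * blocking accept sleeps in the loop above until an established
 * connection is queued, while O_NONBLOCK maps the empty queue to
 * EAGAIN instead. A hypothetical user-level sketch (fd etc. are
 * illustrative):
 */
#if 0
	int fd, newfd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	/* ... bind() ... */
	listen(fd, 5);
	newfd = accept(fd, NULL, NULL);	/* sleeps until a connection lands */
	if (newfd < 0 && errno == EAGAIN)
		;			/* only possible with O_NONBLOCK set */
#endif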
4455
4456
4457 /*
4458 * This will initiate an outgoing connection.
4459 */
4460
4461 static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
4462 {
4463 struct sk_buff *buff;
4464 struct device *dev=NULL;
4465 unsigned char *ptr;
4466 int tmp;
4467 int atype;
4468 struct tcphdr *t1;
4469 struct rtable *rt;
4470
4471 if (sk->state != TCP_CLOSE)
4472 return(-EISCONN);
4473
4474 /*
4475 * Don't allow a double connect.
4476 */
4477
4478 if(sk->daddr)
4479 return -EINVAL;
4480
4481 if (addr_len < 8)
4482 return(-EINVAL);
4483
4484 if (usin->sin_family && usin->sin_family != AF_INET)
4485 return(-EAFNOSUPPORT);
4486
4487 /*
4488 * connect() to INADDR_ANY means loopback (BSD'ism).
4489 */
4490
4491 if(usin->sin_addr.s_addr==INADDR_ANY)
4492 usin->sin_addr.s_addr=ip_my_addr();
4493
4494 /*
4495 * Don't want a TCP connection going to a broadcast address
4496 */
4497
4498 if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
4499 return -ENETUNREACH;
4500
4501 sk->inuse = 1;
4502 sk->daddr = usin->sin_addr.s_addr;
4503 sk->write_seq = tcp_init_seq();
4504 sk->window_seq = sk->write_seq;
4505 sk->rcv_ack_seq = sk->write_seq -1;
4506 sk->err = 0;
4507 sk->dummy_th.dest = usin->sin_port;
4508 release_sock(sk);
4509
4510 buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
4511 if (buff == NULL)
4512 {
4513 return(-ENOMEM);
4514 }
4515 sk->inuse = 1;
4516 buff->sk = sk;
4517 buff->free = 0;
4518 buff->localroute = sk->localroute;
4519
4520
4521 /*
4522 * Put in the IP header and routing stuff.
4523 */
4524
4525 if (sk->localroute)
4526 rt=ip_rt_local(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4527 else
4528 rt=ip_rt_route(sk->daddr, NULL, sk->saddr ? NULL : &sk->saddr);
4529
4530 /*
4531 * When we connect we enforce receive requirements too.
4532 */
4533
4534 sk->rcv_saddr=sk->saddr;
4535
4536 /*
4537 * We need to build the routing stuff from the things saved in skb.
4538 */
4539
4540 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
4541 IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
4542 if (tmp < 0)
4543 {
4544 sock_wfree(sk, buff);
4545 release_sock(sk);
4546 return(-ENETUNREACH);
4547 }
4548
4549 t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
4550
4551 memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
4552 t1->seq = ntohl(sk->write_seq++);
4553 sk->sent_seq = sk->write_seq;
4554 buff->h.seq = sk->write_seq;
4555 t1->ack = 0;
4556 t1->window = 2;
4557 t1->res1=0;
4558 t1->res2=0;
4559 t1->rst = 0;
4560 t1->urg = 0;
4561 t1->psh = 0;
4562 t1->syn = 1;
4563 t1->urg_ptr = 0;
4564 t1->doff = 6;
4565 /* use 512 or whatever user asked for */
4566
4567 if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
4568 sk->window_clamp=rt->rt_window;
4569 else
4570 sk->window_clamp=0;
4571
4572 if (sk->user_mss)
4573 sk->mtu = sk->user_mss;
4574 else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
4575 sk->mtu = rt->rt_mss;
4576 else
4577 {
4578 #ifdef CONFIG_INET_SNARL
4579 if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
4580 #else
4581 if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
4582 #endif
4583 sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
4584 else
4585 sk->mtu = MAX_WINDOW;
4586 }
4587 /*
4588 * but not bigger than device MTU
4589 */
4590
4591 if(sk->mtu <32)
4592 sk->mtu = 32; /* Sanity limit */
4593
4594 sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
4595
4596 /*
4597 * Put in the TCP options to say MTU.
4598 */
4599
4600 ptr = skb_put(buff,4);
4601 ptr[0] = 2;
4602 ptr[1] = 4;
4603 ptr[2] = (sk->mtu) >> 8;
4604 ptr[3] = (sk->mtu) & 0xff;
4605 tcp_send_check(t1, sk->saddr, sk->daddr,
4606 sizeof(struct tcphdr) + 4, sk);
4607
4608 /*
4609 * This must go first otherwise a really quick response will get reset.
4610 */
4611
4612 tcp_cache_zap();
4613 tcp_set_state(sk,TCP_SYN_SENT);
4614 if(rt&&rt->rt_flags&RTF_IRTT)
4615 sk->rto = rt->rt_irtt;
4616 else
4617 sk->rto = TCP_TIMEOUT_INIT;
4618 sk->retransmit_timer.function=&retransmit_timer;
4619 sk->retransmit_timer.data = (unsigned long)sk;
4620 reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */
4621 sk->retransmits = 0; /* Now works the right way instead of a hacked
4622 initial setting */
4623
4624 sk->prot->queue_xmit(sk, dev, buff, 0);
4625 reset_xmit_timer(sk, TIME_WRITE, sk->rto);
4626 tcp_statistics.TcpActiveOpens++;
4627 tcp_statistics.TcpOutSegs++;
4628
4629 release_sock(sk);
4630 return(0);
4631 }
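
/*
 * The four option bytes built above encode the MSS option: kind 2,
 * length 4, then the 16-bit MSS in network byte order. doff is 6
 * because the header becomes 20 + 4 = 24 bytes = 6 32-bit words.
 * Worked example (sketch): an mtu of 1460 yields { 2, 4, 0x05, 0xb4 }.
 */
#if 0
	unsigned char opt[4];
	unsigned short mss = 1460;

	opt[0] = 2;		/* kind: maximum segment size */
	opt[1] = 4;		/* total option length in bytes */
	opt[2] = mss >> 8;	/* 0x05 */
	opt[3] = mss & 0xff;	/* 0xb4 */
#endif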
4632
4633
4634 /*
4635 * This function checks whether the tcp header is actually acceptable.
4636 */
4637
4638 extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
4639 struct options *opt, unsigned long saddr, struct device *dev)
4640 {
4641 u32 next_seq;
4642
4643 next_seq = len - 4*th->doff;
4644 if (th->fin)
4645 next_seq++;
4646 /* if we have a zero window, we can't have any data in the packet.. */
4647 if (next_seq && !sk->window)
4648 goto ignore_it;
4649 next_seq += th->seq;
4650
4651 /*
4652 * This isn't quite right. sk->acked_seq could be more recent
4653 * than sk->window. This is however close enough. We will accept
4654 * slightly more packets than we should, but it should not cause
4655 * problems unless someone is trying to forge packets.
4656 */
4657
4658 /* have we already seen all of this packet? */
4659 if (!after(next_seq+1, sk->acked_seq))
4660 goto ignore_it;
4661 /* or does it start beyond the window? */
4662 if (!before(th->seq, sk->acked_seq + sk->window + 1))
4663 goto ignore_it;
4664
4665 /* ok, at least part of this packet would seem interesting.. */
4666 return 1;
4667
4668 ignore_it:
4669 if (th->rst)
4670 return 0;
4671
4672 /*
4673 * Send a reset if we get something not ours and we are
4674 * unsynchronized. Note: We don't do anything to our end. We
4675 * are just killing the bogus remote connection then we will
4676 * connect again and it will work (with luck).
4677 */
4678
4679 if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
4680 {
4681 tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
4682 return 1;
4683 }
4684
4685 /* Try to resync things. */
4686 tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
4687 return 0;
4688 }
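
/*
 * Numerically: with sk->acked_seq = 100 and sk->window = 500, the two
 * tests above accept a segment that still contains something new
 * relative to sequence 100 and whose first byte lies at or before
 * sequence 600. The acceptance test in isolation (sketch):
 */
#if 0
	int ok = after(next_seq + 1, 100)		/* not wholly old data */
	      && before(th->seq, 100 + 500 + 1);	/* not beyond the window */
#endif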
4689
4690 /*
4691 * When we get a reset we do this.
4692 */
4693
4694 static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
4695 {
4696 sk->zapped = 1;
4697 sk->err = ECONNRESET;
4698 if (sk->state == TCP_SYN_SENT)
4699 sk->err = ECONNREFUSED;
4700 if (sk->state == TCP_CLOSE_WAIT)
4701 sk->err = EPIPE;
4702 #ifdef TCP_DO_RFC1337
4703 /*
4704 * Time wait assassination protection [RFC1337]
4705 */
4706 if(sk->state!=TCP_TIME_WAIT)
4707 {
4708 tcp_set_state(sk,TCP_CLOSE);
4709 sk->shutdown = SHUTDOWN_MASK;
4710 }
4711 #else
4712 tcp_set_state(sk,TCP_CLOSE);
4713 sk->shutdown = SHUTDOWN_MASK;
4714 #endif
4715 if (!sk->dead)
4716 sk->state_change(sk);
4717 kfree_skb(skb, FREE_READ);
4718 release_sock(sk);
4719 return(0);
4720 }
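
/*
 * The error the user sees thus depends on where in the handshake the
 * reset landed. A sketch of the mapping implemented above:
 */
#if 0
	err = ECONNRESET;		/* default: connection reset by peer */
	if (state == TCP_SYN_SENT)
		err = ECONNREFUSED;	/* our connect() was refused */
	if (state == TCP_CLOSE_WAIT)
		err = EPIPE;		/* writing after their FIN */
#endif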
4721
4722 /*
4723 * A TCP packet has arrived.
4724 * skb->h.raw is the TCP header.
4725 */
4726
4727 int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
4728 __u32 daddr, unsigned short len,
4729 __u32 saddr, int redo, struct inet_protocol * protocol)
4730 {
4731 struct tcphdr *th;
4732 struct sock *sk;
4733 int syn_ok=0;
4734
4735 tcp_statistics.TcpInSegs++;
4736 if(skb->pkt_type!=PACKET_HOST)
4737 {
4738 kfree_skb(skb,FREE_READ);
4739 return(0);
4740 }
4741
4742 th = skb->h.th;
4743
4744 /*
4745 * Find the socket, using the last hit cache if applicable.
4746 */
4747
4748 if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport)
4749 {
4750 sk=(struct sock *)th_cache_sk;
4751 /*
4752 * We think this cache is causing the bug, so cross-check it against a full lookup.
4753 */
4754 if(sk!=get_sock(&tcp_prot,th->dest, saddr, th->source, daddr))
4755 printk("Cache mismatch on TCP.\n");
4756 }
4757 else
4758 {
4759 sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
4760 th_cache_saddr=saddr;
4761 th_cache_daddr=daddr;
4762 th_cache_dport=th->dest;
4763 th_cache_sport=th->source;
4764 th_cache_sk=sk;
4765 }
4766
4767 /*
4768 * If this socket has got a reset it's to all intents and purposes
4769 * really dead. Count closed sockets as dead.
4770 *
4771 * Note: BSD appears to have a bug here. A 'closed' TCP in BSD
4772 * simply drops data. This seems incorrect as a 'closed' TCP doesn't
4773 * exist so should cause resets as if the port was unreachable.
4774 */
4775
4776 if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
4777 sk=NULL;
4778
4779 if (!redo)
4780 {
4781 /*
4782 * Pull up the IP header.
4783 */
4784 skb_pull(skb, skb->h.raw-skb->data);
4785 /*
4786 * Try to use the device checksum if provided.
4787 */
4788 if (
4789 (skb->ip_summed && tcp_check(th, len, saddr, daddr, skb->csum ))||
4790 (!skb->ip_summed && tcp_check(th, len, saddr, daddr, csum_partial((char *)th, len, 0)))
4791 )
4792 {
4793 skb->sk = NULL;
4794 kfree_skb(skb,FREE_READ);
4795 /*
4796 * We don't release the socket because it was
4797 * never marked in use.
4798 */
4799 return(0);
4800 }
4801 th->seq = ntohl(th->seq);
4802
4803 /* See if we know about the socket. */
4804 if (sk == NULL)
4805 {
4806 /*
4807 * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
4808 */
4809 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4810 skb->sk = NULL;
4811 /*
4812 * Discard frame
4813 */
4814 kfree_skb(skb, FREE_READ);
4815 return(0);
4816 }
4817
4818 skb->acked = 0;
4819 skb->used = 0;
4820 skb->free = 0;
4821 skb->saddr = daddr;
4822 skb->daddr = saddr;
4823
4824 /* We may need to add it to the backlog here. */
4825 cli();
4826 if (sk->inuse)
4827 {
4828 skb_queue_tail(&sk->back_log, skb);
4829 sti();
4830 return(0);
4831 }
4832 sk->inuse = 1;
4833 sti();
4834 }
4835 else
4836 {
4837 if (sk==NULL)
4838 {
4839 tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
4840 skb->sk = NULL;
4841 kfree_skb(skb, FREE_READ);
4842 return(0);
4843 }
4844 }
4845
4846
4847 if (!sk->prot)
4848 {
4849 printk("IMPOSSIBLE 3\n");
4850 return(0);
4851 }
4852
4853
4854 /*
4855 * Charge the memory to the socket.
4856 */
4857
4858 skb->sk=sk;
4859 sk->rmem_alloc += skb->truesize;
4860
4861 /*
4862 * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
4863 * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
4864 * compatibility. We also set up variables more thoroughly [Karn notes in the
4865 * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
4866 */
4867
4868 if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */
4869 {
4870
4871 /*
4872 * Now deal with unusual cases.
4873 */
4874
4875 if(sk->state==TCP_LISTEN)
4876 {
4877 if(th->ack) /* These use the socket TOS.. might want to be the received TOS */
4878 tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
4879
4880 /*
4881 * We don't care for RST, and non-SYN segments are absorbed (old
4882 * segments). Broadcast/multicast SYN isn't allowed. Note: there is a
4883 * bug here - if you change the netmask on a running connection it can
4884 * go broadcast. Even Suns have this problem, so I'm ignoring it.
4885 */
4886
4887 if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
4888 {
4889 kfree_skb(skb, FREE_READ);
4890 release_sock(sk);
4891 return 0;
4892 }
4893
4894 /*
4895 * Guess we need to make a new socket up
4896 */
4897
4898 tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
4899
4900 /*
4901 * Now we have several options: In theory there is nothing else
4902 * in the frame. KA9Q has an option to send data with the syn,
4903 * BSD accepts data with the syn up to the [to be] advertised window
4904 * and Solaris 2.1 gives you a protocol error. For now we just ignore
4905 * it, that fits the spec precisely and avoids incompatibilities. It
4906 * would be nice in future to drop through and process the data.
4907 */
4908
4909 release_sock(sk);
4910 return 0;
4911 }
4912
4913 /* retransmitted SYN? */
4914 if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
4915 {
4916 kfree_skb(skb, FREE_READ);
4917 release_sock(sk);
4918 return 0;
4919 }
4920
4921 /*
4922 * SYN sent means we have to look for a suitable ack and either reset
4923 * for bad matches or go to connected
4924 */
4925
4926 if(sk->state==TCP_SYN_SENT)
4927 {
4928 /* Crossed SYN or previous junk segment */
4929 if(th->ack)
4930 {
4931 /* We got an ack, but it's not a good ack */
4932 if(!tcp_ack(sk,th,saddr,len))
4933 {
/* Reset the ack - it's an ack from a
different connection [th->rst is checked in tcp_reset()] */
4936 tcp_statistics.TcpAttemptFails++;
4937 tcp_reset(daddr, saddr, th,
4938 sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
4939 kfree_skb(skb, FREE_READ);
4940 release_sock(sk);
4941 return(0);
4942 }
4943 if(th->rst)
4944 return tcp_std_reset(sk,skb);
4945 if(!th->syn)
4946 {
4947 /* A valid ack from a different connection
4948 start. Shouldn't happen but cover it */
4949 kfree_skb(skb, FREE_READ);
4950 release_sock(sk);
4951 return 0;
4952 }
4953 /*
4954 * Ok.. it's good. Set up sequence numbers and
4955 * move to established.
4956 */
4957 syn_ok=1; /* Don't reset this connection for the syn */
4958 sk->acked_seq=th->seq+1;
4959 sk->fin_seq=th->seq;
4960 tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
4961 tcp_set_state(sk, TCP_ESTABLISHED);
4962 tcp_options(sk,th);
4963 sk->dummy_th.dest=th->source;
4964 sk->copied_seq = sk->acked_seq;
4965 if(!sk->dead)
4966 {
4967 sk->state_change(sk);
4968 sock_wake_async(sk->socket, 0);
4969 }
4970 if(sk->max_window==0)
4971 {
4972 sk->max_window = 32;
4973 sk->mss = min(sk->max_window, sk->mtu);
4974 }
4975 }
4976 else
4977 {
4978 /* See if SYN's cross. Drop if boring */
4979 if(th->syn && !th->rst)
4980 {
4981 /* Crossed SYN's are fine - but talking to
4982 yourself is right out... */
4983 if(sk->saddr==saddr && sk->daddr==daddr &&
4984 sk->dummy_th.source==th->source &&
4985 sk->dummy_th.dest==th->dest)
4986 {
4987 tcp_statistics.TcpAttemptFails++;
4988 return tcp_std_reset(sk,skb);
4989 }
4990 tcp_set_state(sk,TCP_SYN_RECV);
4991
4992 /*
4993 * FIXME:
4994 * Must send SYN|ACK here
4995 */
4996 }
4997 /* Discard junk segment */
4998 kfree_skb(skb, FREE_READ);
4999 release_sock(sk);
5000 return 0;
5001 }
5002 /*
5003 * SYN_RECV with data maybe.. drop through
5004 */
5005 goto rfc_step6;
5006 }
5007
5008 /*
5009 * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
5010 * a more complex suggestion for fixing these reuse issues in RFC1644
5011 * but not yet ready for general use. Also see RFC1379.
5012 */
5013
5014 #define BSD_TIME_WAIT
5015 #ifdef BSD_TIME_WAIT
5016 if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
5017 after(th->seq, sk->acked_seq) && !th->rst)
5018 {
5019 u32 seq = sk->write_seq;
5020 if(sk->debug)
5021 printk("Doing a BSD time wait\n");
5022 tcp_statistics.TcpEstabResets++;
5023 sk->rmem_alloc -= skb->truesize;
5024 skb->sk = NULL;
5025 sk->err=ECONNRESET;
5026 tcp_set_state(sk, TCP_CLOSE);
5027 sk->shutdown = SHUTDOWN_MASK;
5028 release_sock(sk);
5029 sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
5030 if (sk && sk->state==TCP_LISTEN)
5031 {
5032 sk->inuse=1;
5033 skb->sk = sk;
5034 sk->rmem_alloc += skb->truesize;
5035 tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
5036 release_sock(sk);
5037 return 0;
5038 }
5039 kfree_skb(skb, FREE_READ);
5040 return 0;
5041 }
5042 #endif
5043 }
5044
5045 /*
5046 * We are now in normal data flow (see the step list in the RFC)
5047 * Note most of these are inline now. I'll inline the lot when
5048 * I have time to test it hard and look at what gcc outputs
5049 */
5050
5051 if(!tcp_sequence(sk,th,len,opt,saddr,dev))
5052 {
5053 kfree_skb(skb, FREE_READ);
5054 release_sock(sk);
5055 return 0;
5056 }
5057
5058 if(th->rst)
5059 return tcp_std_reset(sk,skb);
5060
5061 /*
5062 * !syn_ok is effectively the state test in RFC793.
5063 */
5064
5065 if(th->syn && !syn_ok)
5066 {
5067 tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
5068 return tcp_std_reset(sk,skb);
5069 }
5070
5071 /*
5072 * Process the ACK
5073 */
5074
5075
5076 if(th->ack && !tcp_ack(sk,th,saddr,len))
5077 {
5078 /*
5079 * Our three way handshake failed.
5080 */
5081
5082 if(sk->state==TCP_SYN_RECV)
5083 {
5084 tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
5085 }
5086 kfree_skb(skb, FREE_READ);
5087 release_sock(sk);
5088 return 0;
5089 }
5090
5091 rfc_step6: /* I'll clean this up later */
5092
5093 /*
5094 * If the accepted buffer put us over our queue size we
5095 * now drop it (we must process the ack first to avoid
5096 * deadlock cases).
5097 */
5098
5099 if (sk->rmem_alloc >= sk->rcvbuf)
5100 {
5101 kfree_skb(skb, FREE_READ);
5102 release_sock(sk);
5103 return(0);
5104 }
5105
5106
5107 /*
5108 * Process urgent data
5109 */
5110
5111 if(tcp_urg(sk, th, saddr, len))
5112 {
5113 kfree_skb(skb, FREE_READ);
5114 release_sock(sk);
5115 return 0;
5116 }
5117
5118 /*
5119 * Process the encapsulated data
5120 */
5121
5122 if(tcp_data(skb,sk, saddr, len))
5123 {
5124 kfree_skb(skb, FREE_READ);
5125 release_sock(sk);
5126 return 0;
5127 }
5128
5129 /*
5130 * And done
5131 */
5132
5133 release_sock(sk);
5134 return 0;
5135 }
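
/*
 * Once a connection is synchronised, the checks above run in a fixed
 * order, loosely the RFC793 segment arrival steps. A sketch of just
 * the dispatch skeleton, where "drop" stands for freeing the skb and
 * releasing the socket:
 */
#if 0
	if (!tcp_sequence(sk, th, len, opt, saddr, dev))
		goto drop;			/* outside the window */
	if (th->rst)
		return tcp_std_reset(sk, skb);	/* reset processing */
	if (th->syn && !syn_ok)
		return tcp_std_reset(sk, skb);	/* SYN out of place */
	if (th->ack && !tcp_ack(sk, th, saddr, len))
		goto drop;			/* unacceptable ack */
	if (tcp_urg(sk, th, saddr, len))
		goto drop;			/* urgent data error */
	if (tcp_data(skb, sk, saddr, len))
		goto drop;			/* data queueing error */
#endif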
5136
5137 /*
5138 * This routine sends a packet with an out of date sequence
5139 * number. It assumes the other end will try to ack it.
5140 */
5141
5142 static void tcp_write_wakeup(struct sock *sk)
5143 {
5144 struct sk_buff *buff,*skb;
5145 struct tcphdr *t1;
5146 struct device *dev=NULL;
5147 int tmp;
5148
5149 if (sk->zapped)
5150 return; /* After a valid reset we can send no more */
5151
5152 /*
5153 * Write data can still be transmitted/retransmitted in the
5154 * following states. If any other state is encountered, return.
5155 * [listen/close will never occur here anyway]
5156 */
5157
5158 if (sk->state != TCP_ESTABLISHED &&
5159 sk->state != TCP_CLOSE_WAIT &&
5160 sk->state != TCP_FIN_WAIT1 &&
5161 sk->state != TCP_LAST_ACK &&
5162 sk->state != TCP_CLOSING
5163 )
5164 {
5165 return;
5166 }
5167 if ( before(sk->sent_seq, sk->window_seq) &&
5168 (skb=skb_peek(&sk->write_queue)))
5169 {
5170 /*
5171 * We are probing the opening of a window
5172 * but the window size is != 0, so the blockage
5173 * must have been a result of sender-side SWS avoidance
5174 */
5175
5176 struct iphdr *iph;
5177 struct tcphdr *th;
5178 struct tcphdr *nth;
5179 unsigned long win_size;
5180 #if 0
5181 unsigned long ow_size;
5182 #endif
5183 void * tcp_data_start;
5184
5185 /*
5186 * How many bytes can we send ?
5187 */
5188
5189 win_size = sk->window_seq - sk->sent_seq;
5190
5191 /*
5192 * Recover the buffer pointers
5193 */
5194
5195 iph = (struct iphdr *)skb->ip_hdr;
5196 th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
5197
5198 /*
5199 * Grab the data for a temporary frame
5200 */
5201
5202 buff = sock_wmalloc(sk, win_size + th->doff * 4 +
5203 (iph->ihl << 2) +
5204 sk->prot->max_header + 15,
5205 1, GFP_ATOMIC);
5206 if ( buff == NULL )
5207 return;
5208
5209 /*
5210 * If we strip the packet on the write queue we must
5211 * be ready to retransmit this one
5212 */
5213
5214 buff->free = /*0*/1;
5215
5216 buff->sk = sk;
5217 buff->localroute = sk->localroute;
5218
5219 /*
5220 * Put headers on the new packet
5221 */
5222
5223 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5224 IPPROTO_TCP, sk->opt, buff->truesize,
5225 sk->ip_tos,sk->ip_ttl);
5226 if (tmp < 0)
5227 {
5228 sock_wfree(sk, buff);
5229 return;
5230 }
5231
5232 /*
5233 * Move the TCP header over
5234 */
5235
5236 buff->dev = dev;
5237
5238 nth = (struct tcphdr *) skb_put(buff,th->doff*4);
5239
5240 memcpy(nth, th, th->doff * 4);
5241
5242 /*
5243 * Correct the new header
5244 */
5245
5246 nth->ack = 1;
5247 nth->ack_seq = ntohl(sk->acked_seq);
5248 nth->window = ntohs(tcp_select_window(sk));
5249 nth->check = 0;
5250
5251 /*
5252 * Find the first data byte.
5253 */
5254
5255 tcp_data_start = skb->data + skb->dev->hard_header_len +
5256 (iph->ihl << 2) + th->doff * 4;
5257
5258 /*
5259 * Add it to our new buffer
5260 */
5261 memcpy(skb_put(buff,win_size), tcp_data_start, win_size);
5262
5263 /*
5264 * Remember our right edge sequence number.
5265 */
5266
5267 buff->h.seq = sk->sent_seq + win_size;
5268 sk->sent_seq = buff->h.seq; /* Hack */
5269 #if 0
5270
5271 /*
5272 * now: shrink the queue head segment
5273 */
5274
5275 th->check = 0;
5276 ow_size = skb->len - win_size -
5277 ((unsigned long) (tcp_data_start - (void *) skb->data));
5278
5279 memmove(tcp_data_start, tcp_data_start + win_size, ow_size);
5280 skb_trim(skb,skb->len-win_size);
5281 sk->sent_seq += win_size;
5282 th->seq = htonl(sk->sent_seq);
5283 if (th->urg)
5284 {
5285 unsigned short urg_ptr;
5286
5287 urg_ptr = ntohs(th->urg_ptr);
5288 if (urg_ptr <= win_size)
5289 th->urg = 0;
5290 else
5291 {
5292 urg_ptr -= win_size;
5293 th->urg_ptr = htons(urg_ptr);
5294 nth->urg_ptr = htons(win_size);
5295 }
5296 }
5297 #else
5298 if(th->urg && ntohs(th->urg_ptr) < win_size)
5299 nth->urg = 0;
5300 #endif
5301
5302 /*
5303 * Checksum the split buffer
5304 */
5305
5306 tcp_send_check(nth, sk->saddr, sk->daddr,
5307 nth->doff * 4 + win_size , sk);
5308 }
5309 else
5310 {
5311 buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
5312 if (buff == NULL)
5313 return;
5314
5315 buff->free = 1;
5316 buff->sk = sk;
5317 buff->localroute = sk->localroute;
5318
5319 /*
5320 * Put in the IP header and routing stuff.
5321 */
5322
5323 tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
5324 IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
5325 if (tmp < 0)
5326 {
5327 sock_wfree(sk, buff);
5328 return;
5329 }
5330
5331 t1 = (struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));
5332 memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
5333
5334 /*
5335 * Use a previous sequence.
5336 * This should cause the other end to send an ack.
5337 */
5338
5339 t1->seq = htonl(sk->sent_seq-1);
5340 t1->ack = 1;
5341 t1->res1= 0;
5342 t1->res2= 0;
5343 t1->rst = 0;
5344 t1->urg = 0;
5345 t1->psh = 0;
5346 t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
5347 t1->syn = 0;
5348 t1->ack_seq = ntohl(sk->acked_seq);
5349 t1->window = ntohs(tcp_select_window(sk));
5350 t1->doff = sizeof(*t1)/4;
5351 tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
5352
5353 }
5354
5355 /*
5356 * Send it.
5357 */
5358
5359 sk->prot->queue_xmit(sk, dev, buff, 1);
5360 tcp_statistics.TcpOutSegs++;
5361 }
5362
5363 /*
5364 * A window probe timeout has occurred.
5365 */
5366
5367 void tcp_send_probe0(struct sock *sk)
5368 {
5369 if (sk->zapped)
5370 return; /* After a valid reset we can send no more */
5371
5372 tcp_write_wakeup(sk);
5373
5374 sk->backoff++;
5375 sk->rto = min(sk->rto << 1, 120*HZ);
5376 reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
5377 sk->retransmits++;
5378 sk->prot->retransmits ++;
5379 }
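
/*
 * Worked example of the backoff above, assuming HZ = 100 and an
 * initial rto of 3*HZ: successive probe intervals are 6, 12, 24, 48
 * and 96 seconds, then pinned at the 120 second ceiling. Sketch:
 */
#if 0
	int rto = 3*HZ;
	int i;

	for (i = 0; i < 8; i++)
	{
		rto = min(rto << 1, 120*HZ);	/* double, capped at 2 min */
		printk("probe interval %d: %d jiffies\n", i, rto);
	}
#endif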
5380
5381 /*
5382 * Socket option code for TCP.
5383 */
5384
5385 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
5386 {
5387 int val,err;
5388
5389 if(level!=SOL_TCP)
5390 return ip_setsockopt(sk,level,optname,optval,optlen);
5391
5392 if (optval == NULL)
5393 return(-EINVAL);
5394
5395 err=verify_area(VERIFY_READ, optval, sizeof(int));
5396 if(err)
5397 return err;
5398
5399 val = get_user((int *)optval);
5400
5401 switch(optname)
5402 {
5403 case TCP_MAXSEG:
5404 /*
5405 * Values greater than the interface MTU won't take effect; however,
5406 * at the point when this call is made we typically don't yet know
5407 * which interface is going to be used.
5408 */
5409 if(val<1||val>MAX_WINDOW)
5410 return -EINVAL;
5411 sk->user_mss=val;
5412 return 0;
5413 case TCP_NODELAY:
5414 sk->nonagle=(val==0)?0:1;
5415 return 0;
5416 default:
5417 return(-ENOPROTOOPT);
5418 }
5419 }
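
/*
 * From user space the two options look like this (hypothetical sketch;
 * SOL_TCP has the same value as IPPROTO_TCP):
 */
#if 0
	int one = 1;
	int mss = 1400;

	setsockopt(fd, SOL_TCP, TCP_NODELAY, (char *) &one, sizeof(one));
	setsockopt(fd, SOL_TCP, TCP_MAXSEG, (char *) &mss, sizeof(mss));
#endif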
5420
5421 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
5422 {
5423 int val,err;
5424
5425 if(level!=SOL_TCP)
5426 return ip_getsockopt(sk,level,optname,optval,optlen);
5427
5428 switch(optname)
5429 {
5430 case TCP_MAXSEG:
5431 val=sk->user_mss;
5432 break;
5433 case TCP_NODELAY:
5434 val=sk->nonagle;
5435 break;
5436 default:
5437 return(-ENOPROTOOPT);
5438 }
5439 err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
5440 if(err)
5441 return err;
5442 put_user(sizeof(int),(int *) optlen);
5443
5444 err=verify_area(VERIFY_WRITE, optval, sizeof(int));
5445 if(err)
5446 return err;
5447 put_user(val,(int *)optval);
5448
5449 return(0);
5450 }
5451
5452
5453 struct proto tcp_prot = {
5454 tcp_close,
5455 ip_build_header,
5456 tcp_connect,
5457 tcp_accept,
5458 ip_queue_xmit,
5459 tcp_retransmit,
5460 tcp_write_wakeup,
5461 tcp_read_wakeup,
5462 tcp_rcv,
5463 tcp_select,
5464 tcp_ioctl,
5465 NULL,
5466 tcp_shutdown,
5467 tcp_setsockopt,
5468 tcp_getsockopt,
5469 tcp_sendmsg,
5470 tcp_recvmsg,
5471 NULL, /* No special bind() */
5472 128,
5473 0,
5474 "TCP",
5475 0, 0,
5476 {NULL,}
5477 };