/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. select
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *		Michael O'Reilly:	ack < copied bug fix.
 *		Johannes Stille	:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle select() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), select() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in selecting before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	Select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSE.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *
 * To Fix:
 *		Fast path the code. Two things here - fix the window calculation
 *		so it doesn't iterate over the queue, also spot packets with no funny
 *		options arriving in order and process directly.
 *
 *		Rewrite output state machine to use a single queue.
 *		Speed up input assembly algorithm.
 *		RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
 *		could do with it working on IPv4
 *		User settable/learned rtt/max window/mtu
 *
 *		Change the fundamental structure to a single send queue maintained
 *		by TCP (removing the bogus ip stuff [thus fixing mtu drops on
 *		active routes too]). Cut the queue off in tcp_retransmit/
 *		tcp_transmit.
 *		Change the receive queue to assemble as it goes. This lets us
 *		dispose of most of tcp_sequence, half of tcp_ack and chunks of
 *		tcp_data/tcp_read as well as the window shrink crud.
 *		Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
 *		tcp_queue_skb seem obvious routines to extract.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */

/*
 *	RFC1122 status:
 *	NOTE: I'm not going to be doing comments in the code for this one except
 *	for violations and the like.  tcp.c is just too big... If I say something
 *	"does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 *	with Alan. -- MS 950903
 *
 *	Use of PSH (4.2.2.2)
 *	MAY aggregate data sent without the PSH flag. (does)
 *	MAY queue data received without the PSH flag. (does)
 *	SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 *	MAY implement PSH on send calls. (doesn't, thus:)
 *	MUST NOT buffer data indefinitely (doesn't [1 second])
 *	MUST set PSH on last segment (does)
 *	MAY pass received PSH to application layer (doesn't)
 *	SHOULD send maximum-sized segment whenever possible. (almost always does)
 *
 *	Window Size (4.2.2.3, 4.2.2.16)
 *	MUST treat window size as an unsigned number (does)
 *	SHOULD treat window size as a 32-bit number (does not)
 *	MUST NOT shrink window once it is offered (does not normally)
 *
 *	Urgent Pointer (4.2.2.4)
 *	**MUST point urgent pointer to last byte of urgent data (not right
 *	after). (doesn't, to be like BSD)
 *	MUST inform application layer asynchronously of incoming urgent
 *	data. (does)
 *	MUST provide application with means of determining the amount of
 *	urgent data pending. (does)
 *	**MUST support urgent data sequence of arbitrary length. (doesn't, but
 *	it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 *	[Follows BSD 1 byte of urgent data]
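 *	(Illustration, added for clarity and not part of the RFC text: for
 *	a segment carrying bytes with sequence numbers 100-105 whose urgent
 *	data ends at byte 102, RFC 1122 wants the urgent pointer to
 *	reference byte 102 itself, while BSD - and hence this stack -
 *	points it one past, at byte 103.)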
 *
 *	TCP Options (4.2.2.5)
 *	MUST be able to receive TCP options in any segment. (does)
 *	MUST ignore unsupported options (does)
 *
 *	Maximum Segment Size Option (4.2.2.6)
 *	MUST implement both sending and receiving MSS. (does)
 *	SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 *	it always). (does, even when MSS == 536, which is legal)
 *	MUST assume MSS == 536 if no MSS received at connection setup (does)
 *	MUST calculate "effective send MSS" correctly:
 *		min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 *	(does - but allows operator override)
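 *	(Worked example of that formula, added for clarity: over Ethernet
 *	with a physical MTU of 1500, a remote MSS of 1460 and no IP
 *	options, min(1500, 1460+20) - 20 - 0 = 1460 data bytes per
 *	segment; a remote MSS of 536 gives min(1500, 556) - 20 = 536.)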
 *
 *	TCP Checksum (4.2.2.7)
 *	MUST generate and check TCP checksum. (does)
 *
 *	Initial Sequence Number Selection (4.2.2.8)
 *	MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
 *	OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 *	necessary for 10Mbps networks - and harder than BSD to spoof!)
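 *	(Arithmetic note, added for clarity: at 1MHz the 32-bit sequence
 *	clock wraps every 2^32 microseconds, i.e. roughly every 71.6
 *	minutes, comfortably longer than any segment lifetime.)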
 *
 *	Simultaneous Open Attempts (4.2.2.10)
 *	MUST support simultaneous open attempts (does)
 *
 *	Recovery from Old Duplicate SYN (4.2.2.11)
 *	MUST keep track of active vs. passive open (does)
 *
 *	RST segment (4.2.2.12)
 *	SHOULD allow an RST segment to contain data (does, but doesn't do
 *	anything with it, which is standard)
 *
 *	Closing a Connection (4.2.2.13)
 *	MUST inform application of whether connection was closed by RST or
 *	normal close. (does)
 *	MAY allow "half-duplex" close (treat connection as closed for the
 *	local app, even before handshake is done). (does)
 *	MUST linger in TIME_WAIT for 2 * MSL (does)
 *
 *	Retransmission Timeout (4.2.2.15)
 *	MUST implement Jacobson's slow start and congestion avoidance
 *	stuff. (does)
 *
 *	Probing Zero Windows (4.2.2.17)
 *	MUST support probing of zero windows. (does)
 *	MAY keep offered window closed indefinitely. (does)
 *	MUST allow remote window to stay closed indefinitely. (does)
 *
 *	Passive Open Calls (4.2.2.18)
 *	MUST NOT let new passive open affect other connections. (doesn't)
 *	MUST support passive opens (LISTENs) concurrently. (does)
 *
 *	Time to Live (4.2.2.19)
 *	MUST make TCP TTL configurable. (does - IP_TTL option)
 *
 *	Event Processing (4.2.2.20)
 *	SHOULD queue out-of-order segments. (does)
 *	MUST aggregate ACK segments whenever possible. (does but badly)
 *
 *	Retransmission Timeout Calculation (4.2.3.1)
 *	MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 *	calculation. (does, or at least explains them in the comments 8*b)
 *	SHOULD initialize RTO to 0 and RTT to 3. (does)
 *
 *	When to Send an ACK Segment (4.2.3.2)
 *	SHOULD implement delayed ACK. (does)
 *	MUST keep ACK delay < 0.5 sec. (does)
 *
 *	When to Send a Window Update (4.2.3.3)
 *	MUST implement receiver-side SWS. (does)
 *
 *	When to Send Data (4.2.3.4)
 *	MUST implement sender-side SWS. (does)
 *	SHOULD implement Nagle algorithm. (does)
 *
 *	TCP Connection Failures (4.2.3.5)
 *	MUST handle excessive retransmissions "properly" (see the RFC). (does)
 *	SHOULD inform application layer of soft errors. (does)
 *
 *	TCP Keep-Alives (4.2.3.6)
 *	MAY provide keep-alives. (does)
 *	MUST make keep-alives configurable on a per-connection basis. (does)
 *	MUST default to no keep-alives. (does)
 *	**MUST make keep-alive interval configurable. (doesn't)
 *	**MUST make default keep-alive interval > 2 hours. (doesn't)
 *	MUST NOT interpret failure to ACK keep-alive packet as dead
 *	connection. (doesn't)
 *	SHOULD send keep-alive with no data. (does)
 *
 *	TCP Multihoming (4.2.3.7)
 *	MUST get source address from IP layer before sending first
 *	SYN. (does)
 *	MUST use same local address for all segments of a connection. (does)
 *
 *	IP Options (4.2.3.8)
 *	MUST ignore unsupported IP options. (does)
 *	MAY support Time Stamp and Record Route. (does)
 *	MUST allow application to specify a source route. (does)
 *	MUST allow received Source Route option to set route for all future
 *	segments on this connection. (does not (security issues))
 *
 *	ICMP messages (4.2.3.9)
 *	MUST act on ICMP errors. (does)
 *	MUST slow transmission upon receipt of a Source Quench. (does)
 *	MUST NOT abort connection upon receipt of soft Destination
 *	Unreachables (0, 1, 5), Time Exceededs and Parameter
 *	Problems. (doesn't)
 *	SHOULD report soft Destination Unreachables etc. to the
 *	application. (does)
 *	SHOULD abort connection upon receipt of hard Destination Unreachable
 *	messages (2, 3, 4). (does)
 *
 *	Remote Address Validation (4.2.3.10)
 *	MUST reject as an error OPEN for invalid remote IP address. (does)
 *	MUST ignore SYN with invalid source address. (does)
 *	MUST silently discard incoming SYN for broadcast/multicast
 *	address. (does)
 *
 *	Asynchronous Reports (4.2.4.1)
 *	MUST provide mechanism for reporting soft errors to application
 *	layer. (does)
 *
 *	Type of Service (4.2.4.2)
 *	MUST allow application layer to set Type of Service. (does IP_TOS)
 *
 *	(Whew. -- MS 950903)
 **/

#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>

#include <net/icmp.h>
#include <net/tcp.h>

#include <asm/segment.h>

unsigned long seq_offset;
struct tcp_mib tcp_statistics;

static void tcp_close(struct sock *sk, unsigned long timeout);

/*
 *	The less said about this the better, but it works and will do for 1.2 (and 1.4 ;))
 */

struct wait_queue *master_select_wakeup;

/*
 *	Find someone to 'accept'. Must be called with
 *	the socket locked or with interrupts disabled
 */

static struct sk_buff *tcp_find_established(struct sock *s)
{
	struct sk_buff *p=skb_peek(&s->receive_queue);
	if(p==NULL)
		return NULL;
	do
	{
		if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
			return p;
		p=p->next;
	}
	while(p!=(struct sk_buff *)&s->receive_queue);
	return NULL;
}

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 */

static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	struct sk_buff *skb;
	unsigned long flags;
	save_flags(flags);
	cli();
	skb=tcp_find_established(s);
	if(skb!=NULL)
		skb_unlink(skb);	/* Take it off the queue */
	restore_flags(flags);
	return skb;
}

/*
 *	This routine closes sockets which have been at least partially
 *	opened, but not yet accepted. Currently it is only called by
 *	tcp_close, and timeout mirrors the value there.
 */

static void tcp_close_pending (struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
	{
		tcp_close(skb->sk, 0);
		kfree_skb(skb, FREE_READ);
	}
	return;
}

/*
 *	Enter the time wait state.
 */

void tcp_time_wait(struct sock *sk)
{
	tcp_set_state(sk,TCP_TIME_WAIT);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
	tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}


/*
 *	This routine is called by the ICMP module when it gets some
 *	sort of error condition.  If err < 0 then the socket should
 *	be closed and the error returned to the user.  If err > 0
 *	it's just the icmp type << 8 | icmp code.  After adjustment
 *	header points to the first 8 bytes of the tcp header.  We need
 *	to find the appropriate port.
 */
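
/*
 *	(Example of that encoding, added for clarity: an ICMP
 *	"fragmentation needed" message is type 3, code 4, so it shows
 *	up in the err > 0 form as (3 << 8) | 4 = 0x0304.)
 */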

void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
	__u32 saddr, struct inet_protocol *protocol)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct sock *sk;

	/*
	 *	This one is _WRONG_. FIXME urgently.
	 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 *	For now we will just trigger a linear backoff.
		 *	The slow start code should cause a real backoff here.
		 */
		if (sk->cong_window > 4)
			sk->cong_window--;
		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;
		/*
		 * Ugly trick to pass MTU to protocol layer.
		 * Really we should add argument "info" to error handler.
		 */
		unsigned short new_mtu = ntohs(iph->id);

		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
			&& new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
			sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);

		return;
	}
#endif
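	/*
	 *	(Worked example of the clamp above, added for clarity: if the
	 *	ICMP message reports a path MTU of 576, the new per-socket MSS
	 *	becomes 576 - 20 (IP header) - 20 (TCP header) = 536 bytes,
	 *	provided the old sk->mtu was larger.)
	 */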

	/*
	 *	If we've already connected we will keep trying
	 *	until we time out, or the user gives up.
	 */

	if (code < 13)
	{
		if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			sk->err = icmp_err_convert[code].errno;
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			{
				tcp_statistics.TcpAttemptFails++;
				tcp_set_state(sk,TCP_CLOSE);
				sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
			}
		}
		else	/* Only an error on timeout */
			sk->err_soft = icmp_err_convert[code].errno;
	}
}


/*
 *	Walk down the receive queue counting readable data until we hit the end or we find a gap
 *	in the received data queue (i.e. a frame missing that needs sending to us). Not
 *	sorting using two queues as data arrives makes life so much harder.
 */
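
/*
 *	(Example of the walk below, added for clarity, assuming plain data
 *	segments with no SYN/URG/PSH flags: with copied_seq at 1000 and
 *	queued segments covering 1000-1499 and 2000-2499, tcp_readable()
 *	reports 500 bytes - the hole at 1500 stops the count even though
 *	more data sits beyond it.)
 */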

static int tcp_readable(struct sock *sk)
{
	unsigned long counted;
	unsigned long amount;
	struct sk_buff *skb;
	int sum;
	unsigned long flags;

	if(sk && sk->debug)
		printk("tcp_readable: %p - ",sk);

	save_flags(flags);
	cli();
	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
	{
		restore_flags(flags);
		if(sk && sk->debug)
			printk("empty\n");
		return(0);
	}

	counted = sk->copied_seq;	/* Where we are at the moment */
	amount = 0;

	/*
	 *	Do until a push or until we are out of data.
	 */

	do
	{
		if (before(counted, skb->seq))		/* Found a hole so stop here */
			break;
		sum = skb->len - (counted - skb->seq);	/* Length - header but start from where we are up to (avoid overlaps) */
		if (skb->h.th->syn)
			sum++;
		if (sum > 0)
		{	/* Add it up, move on */
			amount += sum;
			if (skb->h.th->syn)
				amount--;
			counted += sum;
		}
		/*
		 * Don't count urg data ... but do it in the right place!
		 * Consider: "old_data (ptr is here) URG PUSH data"
		 * The old code would stop at the first push because
		 * it counted the urg (amount==1) and then does amount--
		 * *after* the loop. This means tcp_readable() always
		 * returned zero if any URG PUSH was in the queue, even
		 * though there was normal data available. If we subtract
		 * the urg data right here, we even get it to work for more
		 * than one URG PUSH skb without normal data.
		 * This means that select() finally works now with urg data
		 * in the queue.  Note that rlogin was never affected
		 * because it doesn't use select(); it uses two processes
		 * and a blocking read().  And the queue scan in tcp_read()
		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
		 */
		if (skb->h.th->urg)
			amount--;	/* don't count urg data */
		if (amount && skb->h.th->psh) break;
		skb = skb->next;
	}
	while(skb != (struct sk_buff *)&sk->receive_queue);

	restore_flags(flags);
	if(sk->debug)
		printk("got %lu bytes.\n",amount);
	return(amount);
}

/*
 * LISTEN is a special case for select..
 */
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sel_type == SEL_IN) {
		int retval;

		lock_sock(sk);
		retval = (tcp_find_established(sk) != NULL);
		release_sock(sk);
		if (!retval)
			select_wait(&master_select_wakeup,wait);
		return retval;
	}
	return 0;
}


/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper select layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
	if (sk->state == TCP_LISTEN)
		return tcp_listen_select(sk, sel_type, wait);

	switch(sel_type) {
	case SEL_IN:
		if (sk->err)
			return 1;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;

		if (sk->shutdown & RCV_SHUTDOWN)
			return 1;

		if (sk->acked_seq == sk->copied_seq)
			break;

		if (sk->urg_seq != sk->copied_seq ||
		    sk->acked_seq != sk->copied_seq+1 ||
		    sk->urginline || !sk->urg_data)
			return 1;
		break;

	case SEL_OUT:
		if (sk->err)
			return 1;
		if (sk->shutdown & SEND_SHUTDOWN)
			return 0;
		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			break;
		/*
		 * This is now right thanks to a small fix
		 * by Matt Dillon.
		 */
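		/*
		 *	(Numerically, assuming an mtu of 1460: select only
		 *	reports the socket writable once 1460 + 128 +
		 *	max_header bytes of send buffer are free, i.e. room
		 *	for a full segment plus overhead.)
		 */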

		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
			break;
		return 1;

	case SEL_EX:
		if (sk->urg_data)
			return 1;
		break;
	}
	select_wait(sk->sleep, wait);
	return 0;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	int err;
	switch(cmd)
	{

		case TIOCINQ:
#ifdef FIXME	/* FIXME: */
		case FIONREAD:
#endif
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);

			lock_sock(sk);
			amount = tcp_readable(sk);
			release_sock(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		case SIOCATMARK:
		{
			int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;

			err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
			if (err)
				return err;
			put_user(answ,(int *) arg);
			return(0);
		}
		case TIOCOUTQ:
		{
			unsigned long amount;

			if (sk->state == TCP_LISTEN)
				return(-EINVAL);
			amount = sock_wspace(sk);
			err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
			if(err)
				return err;
			put_user(amount, (int *)arg);
			return(0);
		}
		default:
			return(-EINVAL);
	}
}


/*
 *	This routine computes a TCP checksum.
 *
 *	Modified January 1995 from a go-faster DOS routine by
 *	Jorge Cwik <jorge@laser.satlink.net>
 */
#undef DEBUG_TCP_CHECK
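/*
 *	(Background note, added for clarity: per RFC 793 the checksum
 *	covers a pseudo-header of source address, destination address,
 *	protocol and TCP length, followed by the TCP header and payload.)
 */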
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
	unsigned long daddr, int len, struct sk_buff *skb)
{
#ifdef DEBUG_TCP_CHECK
	u16 check;
#endif
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,sizeof(*th),skb->csum));

#ifdef DEBUG_TCP_CHECK
	check = th->check;
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr,
		csum_partial((char *)th,len,0));
	if (check != th->check) {
		static int count = 0;
		if (++count < 10) {
			printk("Checksum %x (%x) from %p\n", th->check, check,
				(&th)[-1]);
			printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
		}
	}
#endif
}


/*
 *	This routine builds a generic TCP header.
 */

extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 *	Wait for a socket to get into the connected state
 */
static void wait_for_tcp_connect(struct sock * sk)
{
	release_sock(sk);
	cli();
	if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
	{
		interruptible_sleep_on(sk->sleep);
	}
	sti();
	lock_sock(sk);
}

/*
 *	Wait for more memory for a socket
 */
static void wait_for_tcp_memory(struct sock * sk)
{
	release_sock(sk);
	cli();
	if (sk->wmem_alloc*2 > sk->sndbuf &&
	    (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
	    && sk->err == 0)
	{
		sk->socket->flags &= ~SO_NOSPACE;
		interruptible_sleep_on(sk->sleep);
	}
	sti();
	lock_sock(sk);
}


/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 */

static int do_tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags)
{
	int copied = 0;
	int copy;
	int tmp;
	int seglen;
	int iovct=0;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	struct proto *prot;
	struct device *dev = NULL;
	unsigned char *from;

	/*
	 *	Ok commence sending
	 */

	while(iovct<msg->msg_iovlen)
	{
		seglen=msg->msg_iov[iovct].iov_len;
		from=msg->msg_iov[iovct++].iov_base;
		prot = sk->prot;
		while(seglen > 0)
		{
			/*
			 *	Stop on errors
			 */
			if (sk->err)
			{
				if (copied)
					return copied;
				return sock_error(sk);
			}

			/*
			 *	Make sure that we are established.
			 */
			if (sk->shutdown & SEND_SHUTDOWN)
			{
				if (copied)
					return copied;
				return -EPIPE;
			}

			/*
			 *	Wait for a connection to finish.
			 */
			while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
			{
				if (copied)
					return copied;

				if (sk->err)
					return sock_error(sk);

				if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
				{
					if (sk->keepopen)
						send_sig(SIGPIPE, current, 0);
					return -EPIPE;
				}

				if (nonblock)
					return -EAGAIN;

				if (current->signal & ~current->blocked)
					return -ERESTARTSYS;

				wait_for_tcp_connect(sk);
			}

			/*
			 *	The following code can result in copy <= 0 if sk->mss is ever
			 *	decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
			 *	sk->mtu is constant once SYN processing is finished.  I.e. we
			 *	had better not get here until we've seen his SYN and at least one
			 *	valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
			 *	But ESTABLISHED should guarantee that.  sk->max_window is by
			 *	definition non-decreasing.  Note that any ioctl to set user_mss
			 *	must be done before the exchange of SYN's.  If the initial ack
			 *	from the other end has a window of 0, max_window and thus mss
			 *	will both be 0.
			 */

			/*
			 *	Now we need to check if we have a half built packet.
			 */
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			/*
			 *	FIXME: I'm almost sure that this fragment is BUG,
			 *	but it works... I do not know why 8) --ANK
			 *
			 *	Really, we should rebuild all the queues...
			 *	It's difficult. Temporary hack is to send all
			 *	queued segments with allowed fragmentation.
			 */
			{
				int new_mss = min(sk->mtu, sk->max_window);
				if (new_mss < sk->mss)
				{
					tcp_send_partial(sk);
					sk->mss = new_mss;
				}
			}
#endif

			if ((skb = tcp_dequeue_partial(sk)) != NULL)
			{
				int tcp_size;

				tcp_size = skb->tail - (unsigned char *)(skb->h.th + 1);

				/* Add more stuff to the end of skb->len */
				if (!(flags & MSG_OOB))
				{
					copy = min(sk->mss - tcp_size, seglen);
					if (copy <= 0)
					{
						printk("TCP: **bug**: \"copy\" <= 0\n");
						return -EFAULT;
					}
					tcp_size += copy;
					memcpy_fromfs(skb_put(skb,copy), from, copy);
					skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
					from += copy;
					copied += copy;
					len -= copy;
					sk->write_seq += copy;
					seglen -= copy;
				}
				if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
					tcp_send_skb(sk, skb);
				else
					tcp_enqueue_partial(skb, sk);
				continue;
			}

			/*
			 *	We also need to worry about the window.
			 *	If window < 1/2 the maximum window we've seen from this
			 *	host, don't use it.  This is sender side
			 *	silly window prevention, as specified in RFC1122.
			 *	(Note that this is different from earlier versions of
			 *	SWS prevention, e.g. RFC813.)  What we actually do is
			 *	use the whole MSS.  Since this results in the right
			 *	edge of the packet being outside the window, it will
			 *	be queued for later rather than sent.
			 */
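			/*
			 *	(Illustration of the rule below, added for
			 *	clarity: with mss 1460 and max_window 8192, a
			 *	usable window of 2000 bytes is below
			 *	max_window/2 == 4096, so we build a full 1460
			 *	byte segment anyway and let it queue until the
			 *	window opens rather than dribbling out small
			 *	packets.)
			 */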

			copy = sk->window_seq - sk->write_seq;
			if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
				copy = sk->mss;
			if (copy > seglen)
				copy = seglen;

			/*
			 *	We should really check the window here also.
			 */

			send_tmp = NULL;
			if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
			{
				skb = sock_wmalloc(sk, sk->mtu + 128 + prot->max_header + 15, 0, GFP_KERNEL);
				send_tmp = skb;
			}
			else
			{
				skb = sock_wmalloc(sk, copy + prot->max_header + 15, 0, GFP_KERNEL);
			}

			/*
			 *	If we didn't get any memory, we need to sleep.
			 */

			if (skb == NULL)
			{
				sk->socket->flags |= SO_NOSPACE;
				if (nonblock)
				{
					if (copied)
						return copied;
					return -EAGAIN;
				}

				if (current->signal & ~current->blocked)
				{
					if (copied)
						return copied;
					return -ERESTARTSYS;
				}

				wait_for_tcp_memory(sk);
				continue;
			}

			skb->sk = sk;
			skb->free = 0;
			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

			/*
			 *	FIXME: we need to optimize this.
			 *	Perhaps some hints here would be good.
			 */

			tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
			if (tmp < 0 )
			{
				sock_wfree(sk, skb);
				if (copied)
					return(copied);
				return(tmp);
			}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
			skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
			skb->dev = dev;
			skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
			tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
			if (tmp < 0)
			{
				sock_wfree(sk, skb);
				if (copied)
					return(copied);
				return(tmp);
			}

			if (flags & MSG_OOB)
			{
				skb->h.th->urg = 1;
				skb->h.th->urg_ptr = ntohs(copy);
			}

			skb->csum = csum_partial_copy_fromuser(from,
				skb_put(skb,copy), copy, 0);

			from += copy;
			copied += copy;
			len -= copy;
			seglen -= copy;
			skb->free = 0;
			sk->write_seq += copy;

			if (send_tmp != NULL)
			{
				tcp_enqueue_partial(send_tmp, sk);
				continue;
			}
			tcp_send_skb(sk, skb);
		}
	}
	sk->err = 0;

	return copied;
}


static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags)
{
	int retval = -EINVAL;

	/*
	 *	Do sanity checking for sendmsg/sendto/send
	 */

	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		goto out;
	if (msg->msg_name) {
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;

		if (msg->msg_namelen < sizeof(*addr))
			goto out;
		if (addr->sin_family && addr->sin_family != AF_INET)
			goto out;
		retval = -ENOTCONN;
		if(sk->state == TCP_CLOSE)
			goto out;
		retval = -EISCONN;
		if (addr->sin_port != sk->dummy_th.dest)
			goto out;
		if (addr->sin_addr.s_addr != sk->daddr)
			goto out;
	}

	lock_sock(sk);
	retval = do_tcp_sendmsg(sk, msg, len, nonblock, flags);

	/*
	 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
	 *	interactive fast network servers. It's meant to be on and
	 *	it really improves the throughput though not the echo time
	 *	on my slow slip link - Alan
	 *
	 *	If not nagling we can send on the before case too..
	 */

	if (sk->partial) {
		if (!sk->packets_out ||
		    (sk->nonagle && before(sk->write_seq, sk->window_seq))) {
			tcp_send_partial(sk);
		}
	}

	release_sock(sk);

out:
	return retval;
}


/*
 *	Send an ack if one is backlogged at this point. Ought to merge
 *	this with tcp_send_ack().
 *	This is called for delayed acks also.
 */

void tcp_read_wakeup(struct sock *sk)
{
	int tmp;
	struct device *dev = NULL;
	struct tcphdr *t1;
	struct sk_buff *buff;

	if (!sk->ack_backlog)
		return;

	/*
	 *	If we're closed, don't send an ack, or we'll get a RST
	 *	from the closed destination.
	 */
	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
		return;

	/*
	 *	FIXME: we need to put code here to prevent this routine from
	 *	being called.  Being called once in a while is ok, so only check
	 *	if this is the second time in a row.
	 */

	/*
	 *	We need to grab some memory, and put together an ack,
	 *	and then put it into the queue to be sent.
	 */

	buff = sock_wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/* Try again real soon. */
		tcp_reset_xmit_timer(sk, TIME_WRITE, HZ);
		return;
	}

	buff->sk = sk;
	buff->localroute = sk->localroute;
	buff->csum = 0;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		buff->free = 1;
		sock_wfree(sk, buff);
		return;
	}

	t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);

	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;

	sk->window = tcp_select_window(sk);
	t1->window = htons(sk->window);
	t1->ack_seq = htonl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), buff);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}


/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

static int tcp_recv_urg(struct sock * sk, int nonblock,
	struct msghdr *msg, int len, int flags, int *addr_len)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
		return sock_error(sk);

	if (sk->state == TCP_CLOSE || sk->done)
	{
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	lock_sock(sk);
	if (sk->urg_data & URG_VALID)
	{
		char c = sk->urg_data;
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		memcpy_toiovec(msg->msg_iov, &c, 1);
		if(msg->msg_name)
		{
			struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
			sin->sin_family=AF_INET;
			sin->sin_addr.s_addr=sk->daddr;
			sin->sin_port=sk->dummy_th.dest;
		}
		if(addr_len)
			*addr_len=sizeof(struct sockaddr_in);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 *	Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 *	the available implementations agree in this case:
	 *	this call should never block, independent of the
	 *	blocking state of the socket.
	 *	Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}

/*
 *	Release a skb if it is no longer needed. This routine
 *	must be called with interrupts disabled or with the
 *	socket locked so that the sk_buff queue operation is ok.
 */

static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
	sk->ack_backlog++;
	skb->sk = sk;
	__skb_unlink(skb, &sk->receive_queue);
	kfree_skb(skb, FREE_READ);
}

/*
 *	FIXME:
 *	This routine frees used buffers.
 *	It should consider sending an ACK to let the
 *	other end know we now have a bigger window.
 */

static void cleanup_rbuf(struct sock *sk)
{
	struct sk_buff *skb;
	unsigned long rspace;

	/*
	 *	NOTE! The socket must be locked, so that we don't get
	 *	a messed-up receive queue.
	 */
	while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
		if (!skb->used || skb->users)
			break;
		tcp_eat_skb(sk, skb);
	}

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	rspace=sock_rspace(sk);
	if(sk->debug)
		printk("sk->rspace = %lu\n", rspace);
	/*
	 *	This area has caused the most trouble.  The current strategy
	 *	is to simply do nothing if the other end has room to send at
	 *	least 3 full packets, because the ack from those will auto-
	 *	matically update the window.  If the other end doesn't think
	 *	we have much space left, but we have room for at least 1 more
	 *	complete packet than it thinks we do, we will send an ack
	 *	immediately.  Otherwise we will wait up to .5 seconds in case
	 *	the user reads some more.
	 */

	/*
	 *	It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 *	if the other end is offering a window smaller than the agreed on MSS
	 *	(called sk->mtu here).  In theory there's no connection between send
	 *	and receive, and so no reason to think that they're going to send
	 *	small packets.  For the moment I'm using the hack of reducing the mss
	 *	only on the send side, so I'm putting mtu here.
	 */
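
	/*
	 *	(Worked example of the test below, added for clarity: with
	 *	window 4096, bytes_rcv 2048 and mtu 1460 the threshold is
	 *	4096 - 2048 + 1460 = 3508 bytes; if more receive space than
	 *	that has opened up we ack at once, otherwise we let the
	 *	delayed-ack timer run.)
	 */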

	if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
	{
		/* Send an ack right now. */
		tcp_read_wakeup(sk);
	}
	else
	{
		/* Force it to send an ack soon. */
		int was_active = del_timer(&sk->retransmit_timer);
		if (!was_active || jiffies+TCP_ACK_TIME < sk->timer.expires)
		{
			tcp_reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
		}
		else
			add_timer(&sk->retransmit_timer);
	}
}


/*
 *	This routine copies from a sock struct into the user buffer.
 */

static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
	int len, int nonblock, int flags, int *addr_len)
{
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);
	while (len > 0)
	{
		struct sk_buff * skb;
		u32 offset;

		/*
		 *	Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */

		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			if (before(*seq, skb->seq))
				break;
			offset = *seq - skb->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		if (copied)
			break;

		if (sk->err)
		{
			copied = sock_error(sk);
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			u32 urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users--;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (!skb->users)
			tcp_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}

	if(copied>0 && msg->msg_name)
	{
		struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
		sin->sin_family=AF_INET;
		sin->sin_addr.s_addr=sk->daddr;
		sin->sin_port=sk->dummy_th.dest;
	}
	if(addr_len)
		*addr_len=sizeof(struct sockaddr_in);

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}


/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */

static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violation of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 */
	if(dead && ns==TCP_FIN_WAIT2)
	{
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
		else
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}

	return send_fin;
}

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set sk->dead.
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*
	 *	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *	Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */

	if (!(how & SEND_SHUTDOWN))
		return;

	/*
	 *	If we've already sent a FIN, or it's a closed state
	 */

	if (sk->state == TCP_FIN_WAIT1 ||
	    sk->state == TCP_FIN_WAIT2 ||
	    sk->state == TCP_CLOSING ||
	    sk->state == TCP_LAST_ACK ||
	    sk->state == TCP_TIME_WAIT ||
	    sk->state == TCP_CLOSE ||
	    sk->state == TCP_LISTEN
	)
	{
		return;
	}
	lock_sock(sk);

	/*
	 *	flag that the sender has shutdown
	 */

	sk->shutdown |= SEND_SHUTDOWN;

	/*
	 *	Clear out any half completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 *	FIN if needed
	 */

	if (tcp_close_state(sk,0))
		tcp_send_fin(sk);

	release_sock(sk);
}


/*
 *	Return 1 if we still have things to send in our buffers.
 */

static inline int closing(struct sock * sk)
{
	switch (sk->state) {
		case TCP_FIN_WAIT1:
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			return 1;
	}
	return 0;
}


static void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;

	/*
	 * We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */

	lock_sock(sk);

	tcp_cache_zap();
	if(sk->state == TCP_LISTEN)
	{
		/* Special case */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		sk->dead = 1;
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	/*
	 * We need to flush the recv. buffs. We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */

	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
		kfree_skb(skb, FREE_READ);

	/*
	 * Get rid of any half-completed packets.
	 */

	if (sk->partial)
		tcp_send_partial(sk);

	/*
	 * Timeout is not the same thing - however the code likes
	 * to send both the same way (sigh).
	 */

	if (tcp_close_state(sk,1)==1)
	{
		tcp_send_fin(sk);
	}

	if (timeout) {
		cli();
		release_sock(sk);
		current->timeout = timeout;
		while(closing(sk) && current->timeout)
		{
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked)
			{
				break;
			}
		}
		current->timeout=0;
		lock_sock(sk);
		sti();
	}

	/*
	 * This will destroy it. The timers will take care of actually
	 * freeing up the memory.
	 */
	tcp_cache_zap();	/* Kill the cache again. */
	release_sock(sk);
	sk->dead = 1;
}
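/*
 * Editorial aside: the `timeout` argument above is what a user-space
 * SO_LINGER setting turns into; close() then waits in the closing()
 * loop while the FIN drains. A hedged sketch (close_with_linger is a
 * hypothetical helper; fd is assumed to be a connected TCP socket):
 */

#include <unistd.h>
#include <sys/socket.h>

static int close_with_linger(int fd, int seconds)
{
	struct linger lg;

	lg.l_onoff = 1;			/* enable lingering close */
	lg.l_linger = seconds;		/* upper bound on the wait */
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
		return -1;
	return close(fd);		/* may block while closing(sk) is true */
}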


/*
 * This will accept the next outstanding connection.
 */

static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */

	if (sk->state != TCP_LISTEN)
	{
		sk->err = EINVAL;
		return(NULL);
	}

	/* Avoid the race. */
	cli();
	lock_sock(sk);

	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		lock_sock(sk);
	}
	sti();

	/*
	 * Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}
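/*
 * Editorial aside: the user-space view of the O_NONBLOCK branch above.
 * accept() on a non-blocking listener fails with EAGAIN/EWOULDBLOCK
 * instead of sleeping when no established connection is queued. A hedged
 * sketch (try_accept is a hypothetical helper; lfd is assumed to be a
 * bound, listening TCP socket):
 */

#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>

static int try_accept(int lfd)
{
	int cfd;

	fcntl(lfd, F_SETFL, fcntl(lfd, F_GETFL, 0) | O_NONBLOCK);
	cfd = accept(lfd, NULL, NULL);
	if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return -1;	/* nothing pending yet: poll again later */
	return cfd;		/* new connected socket, or a real error */
}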

/*
 * This will initiate an outgoing connection.
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
		return(-EISCONN);

	/*
	 * Don't allow a double connect.
	 */

	if(sk->daddr)
		return -EINVAL;

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 * Don't want a TCP connection going to a broadcast address.
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	lock_sock(sk);
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	release_sock(sk);

	buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	lock_sock(sk);
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	/*
	 * Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
		IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
	if (tmp < 0)
	{
		sock_wfree(sk, buff);
		release_sock(sk);
		return(-ENETUNREACH);
	}
	if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
		sk->saddr = rt->rt_src;
	sk->rcv_saddr = sk->saddr;

	t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	buff->seq = sk->write_seq++;
	t1->seq = htonl(buff->seq);
	sk->sent_seq = sk->write_seq;
	buff->end_seq = sk->write_seq;
	t1->ack = 0;
	t1->window = 2;
	t1->syn = 1;
	t1->doff = 6;
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if (rt)
		sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else
		sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 * ... but not bigger than the device MTU.
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

#ifdef CONFIG_SKIP

	/*
	 * SKIP devices set their MTU to 65535. This is so they can take
	 * packets unfragmented to the security process and then fragment.
	 * They could lie to the TCP layer about a suitable MTU, but it's
	 * easier to let SKIP sort it out, simply because the final packet
	 * we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */

	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif

	/*
	 * Put in the TCP MSS option (the MSS value is kept in sk->mtu here).
	 */

	ptr = skb_put(buff,4);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	buff->csum = csum_partial(ptr, 4, 0);
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, buff);

	/*
	 * This must go first otherwise a really quick response will get reset.
	 */

	tcp_cache_zap();
	tcp_set_state(sk,TCP_SYN_SENT);
	if(rt&&rt->rt_flags&RTF_IRTT)
		sk->rto = rt->rt_irtt;
	else
		sk->rto = TCP_TIMEOUT_INIT;
	sk->retransmit_timer.function=&tcp_retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = 0;	/* Now works the right way instead of a hacked
				   initial setting */

	sk->prot->queue_xmit(sk, dev, buff, 0);
	tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}
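/*
 * Editorial aside: the four ptr[] assignments above hand-build the TCP
 * maximum segment size option: kind 2, length 4, then the 16-bit MSS in
 * network byte order. A hedged stand-alone sketch (build_mss_option and
 * the example value are hypothetical):
 */

static void build_mss_option(unsigned char opt[4], unsigned short mss)
{
	opt[0] = 2;		/* kind: maximum segment size */
	opt[1] = 4;		/* total option length in bytes */
	opt[2] = mss >> 8;	/* high byte, as in ptr[2] above */
	opt[3] = mss & 0xff;	/* low byte, as in ptr[3] above */
}

/* For example, build_mss_option(opt, 1460) yields the bytes 02 04 05 b4. */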

/*
 * Socket option code for TCP.
 */

int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_setsockopt(sk,level,optname,optval,optlen);

	if (optval == NULL)
		return(-EINVAL);

	err=verify_area(VERIFY_READ, optval, sizeof(int));
	if(err)
		return err;

	val = get_user((int *)optval);

	switch(optname)
	{
		case TCP_MAXSEG:
			/*
			 * Values greater than the interface MTU won't take
			 * effect. However, at the point when this call is
			 * made we typically don't yet know which interface
			 * is going to be used.
			 */
			if(val<1||val>MAX_WINDOW)
				return -EINVAL;
			sk->user_mss=val;
			return 0;
		case TCP_NODELAY:
			sk->nonagle=(val==0)?0:1;
			return 0;
		default:
			return(-ENOPROTOOPT);
	}
}
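/*
 * Editorial aside: the user-space counterpart of the TCP_NODELAY case
 * above; a non-zero value flips sk->nonagle and disables the Nagle
 * algorithm on this socket. A hedged sketch (disable_nagle is a
 * hypothetical helper; fd is assumed to be a TCP socket):
 */

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int disable_nagle(int fd)
{
	int one = 1;

	/* IPPROTO_TCP and SOL_TCP are the same option level (6). */
	return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}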

int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_getsockopt(sk,level,optname,optval,optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
		case TCP_NODELAY:
			val=sk->nonagle;
			break;
		default:
			return(-ENOPROTOOPT);
	}
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_user(sizeof(int),(int *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_user(val,(int *)optval);

	return(0);
}
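/*
 * Editorial aside: reading the same two options back goes through
 * tcp_getsockopt() above. A hedged sketch (dump_tcp_opts is a
 * hypothetical helper; fd is assumed to be a TCP socket):
 */

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void dump_tcp_opts(int fd)
{
	int val;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len) == 0)
		printf("TCP_MAXSEG: %d\n", val);	/* sk->user_mss in this kernel */
	len = sizeof(val);
	if (getsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, &len) == 0)
		printf("TCP_NODELAY: %d\n", val);	/* sk->nonagle */
}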


struct proto tcp_prot = {
	tcp_close,			/* close */
	ip_build_header,		/* build_header */
	tcp_connect,			/* connect */
	tcp_accept,			/* accept */
	ip_queue_xmit,			/* queue_xmit */
	tcp_retransmit,			/* retransmit */
	tcp_write_wakeup,		/* write_wakeup */
	tcp_read_wakeup,		/* read_wakeup */
	tcp_rcv,			/* rcv */
	tcp_select,			/* select */
	tcp_ioctl,			/* ioctl */
	NULL,				/* init */
	tcp_shutdown,			/* shutdown */
	tcp_setsockopt,			/* setsockopt */
	tcp_getsockopt,			/* getsockopt */
	tcp_sendmsg,			/* sendmsg */
	tcp_recvmsg,			/* recvmsg */
	NULL,				/* No special bind() */
	128,				/* max_header */
	0,				/* retransmits */
	"TCP",				/* name */
	0, 0,				/* inuse, highestinuse */
	{NULL,}				/* sock_array */
};
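/*
 * Editorial aside: the generic socket layer never calls tcp_connect()
 * or tcp_close() by name; it indirects through this table, just as this
 * file itself does with sk->prot->build_header() and
 * sk->prot->queue_xmit() in tcp_connect(). A hedged sketch of the
 * pattern, with a hypothetical two-entry table:
 */

struct proto_sketch {
	int	(*connect)(void *sk, void *addr, int len);
	void	(*close)(void *sk, unsigned long timeout);
};

/* A caller holding a socket and its protocol table dispatches blindly,
   so TCP, UDP, and raw sockets can share one generic code path. */
static int sketch_connect(struct proto_sketch *prot, void *sk,
			  void *addr, int len)
{
	return prot->connect(sk, addr, len);
}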