1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * The Internet Protocol (IP) module. 7 * 8 * Version: @(#)ip.c 1.0.16b 9/1/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Donald Becker, <becker@super.org> 13 * Alan Cox, <gw4pts@gw4pts.ampr.org> 14 * Richard Underwood 15 * Stefan Becker, <stefanb@yello.ping.de> 16 * Jorge Cwik, <jorge@laser.satlink.net> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 18 * 19 * 20 * Fixes: 21 * Alan Cox : Commented a couple of minor bits of surplus code 22 * Alan Cox : Undefining IP_FORWARD doesn't include the code 23 * (just stops a compiler warning). 24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes 25 * are junked rather than corrupting things. 26 * Alan Cox : Frames to bad broadcast subnets are dumped 27 * We used to process them non broadcast and 28 * boy could that cause havoc. 29 * Alan Cox : ip_forward sets the free flag on the 30 * new frame it queues. Still crap because 31 * it copies the frame but at least it 32 * doesn't eat memory too. 33 * Alan Cox : Generic queue code and memory fixes. 34 * Fred Van Kempen : IP fragment support (borrowed from NET2E) 35 * Gerhard Koerting: Forward fragmented frames correctly. 36 * Gerhard Koerting: Fixes to my fix of the above 8-). 37 * Gerhard Koerting: IP interface addressing fix. 38 * Linus Torvalds : More robustness checks 39 * Alan Cox : Even more checks: Still not as robust as it ought to be 40 * Alan Cox : Save IP header pointer for later 41 * Alan Cox : ip option setting 42 * Alan Cox : Use ip_tos/ip_ttl settings 43 * Alan Cox : Fragmentation bogosity removed 44 * (Thanks to Mark.Bush@prg.ox.ac.uk) 45 * Dmitry Gorodchanin : Send of a raw packet crash fix. 
46 * Alan Cox : Silly ip bug when an overlength 47 * fragment turns up. Now frees the 48 * queue. 49 * Linus Torvalds/ : Memory leakage on fragmentation 50 * Alan Cox : handling. 51 * Gerhard Koerting: Forwarding uses IP priority hints 52 * Teemu Rantanen : Fragment problems. 53 * Alan Cox : General cleanup, comments and reformat 54 * Alan Cox : SNMP statistics 55 * Alan Cox : BSD address rule semantics. Also see 56 * UDP as there is a nasty checksum issue 57 * if you do things the wrong way. 58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file 59 * Alan Cox : IP options adjust sk->priority. 60 * Pedro Roque : Fix mtu/length error in ip_forward. 61 * Alan Cox : Avoid ip_chk_addr when possible. 62 * Richard Underwood : IP multicasting. 63 * Alan Cox : Cleaned up multicast handlers. 64 * Alan Cox : RAW sockets demultiplex in the BSD style. 65 * Gunther Mayer : Fix the SNMP reporting typo 66 * Alan Cox : Always in group 224.0.0.1 67 * Pauline Middelink : Fast ip_checksum update when forwarding 68 * Masquerading support. 69 * Alan Cox : Multicast loopback error for 224.0.0.1 70 * Alan Cox : IP_MULTICAST_LOOP option. 71 * Alan Cox : Use notifiers. 72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too) 73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) 74 * Stefan Becker : Send out ICMP HOST REDIRECT 75 * Arnt Gulbrandsen : ip_build_xmit 76 * Alan Cox : Per socket routing cache 77 * Alan Cox : Fixed routing cache, added header cache. 78 * Alan Cox : Loopback didnt work right in original ip_build_xmit - fixed it. 79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. 80 * Alan Cox : Incoming IP option handling. 81 * Alan Cox : Set saddr on raw output frames as per BSD. 82 * Alan Cox : Stopped broadcast source route explosions. 83 * Alan Cox : Can disable source routing 84 * Takeshi Sone : Masquerading didn't work. 85 * 86 * 87 * 88 * To Fix: 89 * IP option processing is mostly not needed. 
ip_forward needs to know about routing rules 90 * and time stamp but that's about all. Use the route mtu field here too 91 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient 92 * and could be made very efficient with the addition of some virtual memory hacks to permit 93 * the allocation of a buffer that can then be 'grown' by twiddling page tables. 94 * Output fragmentation wants updating along with the buffer management to use a single 95 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet 96 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause 97 * fragmentation anyway. 98 * 99 * FIXME: copy frag 0 iph to qp->iph 100 * 101 * This program is free software; you can redistribute it and/or 102 * modify it under the terms of the GNU General Public License 103 * as published by the Free Software Foundation; either version 104 * 2 of the License, or (at your option) any later version. 105 */ 106
107 #include <asm/segment.h>
108 #include <asm/system.h>
109 #include <linux/types.h>
110 #include <linux/kernel.h>
111 #include <linux/sched.h>
112 #include <linux/mm.h>
113 #include <linux/string.h>
114 #include <linux/errno.h>
115 #include <linux/config.h>
116
117 #include <linux/socket.h>
118 #include <linux/sockios.h>
119 #include <linux/in.h>
120 #include <linux/inet.h>
121 #include <linux/netdevice.h>
122 #include <linux/etherdevice.h>
123
124 #include <net/snmp.h>
125 #include <net/ip.h>
126 #include <net/protocol.h>
127 #include <net/route.h>
128 #include <net/tcp.h>
129 #include <net/udp.h>
130 #include <linux/skbuff.h>
131 #include <net/sock.h>
132 #include <net/arp.h>
133 #include <net/icmp.h>
134 #include <net/raw.h>
135 #include <net/checksum.h>
136 #include <linux/igmp.h>
137 #include <linux/ip_fw.h>
138
139 #define CONFIG_IP_DEFRAG
140
/* Declared elsewhere (TCP retransmit machinery); referenced here historically. */
extern int last_retran;
extern void sort_send(struct sock *sk);

/* NOTE(review): function-like macro evaluates its arguments twice --
   do not call with side-effecting expressions. */
#define min(a,b)	((a)<(b)?(a):(b))
/* True when x is in the 127.0.0.0/8 loopback network (x in network byte order). */
#define LOOPBACK(x)	(((x) & htonl(0xff000000)) == htonl(0x7f000000))

/*
 *	SNMP management statistics.
 *	First two fields are IpForwarding (1 = acting as gateway) and
 *	IpDefaultTTL; all other counters start at zero.
 */

#ifdef CONFIG_IP_FORWARD
struct ip_mib ip_statistics={1,64,};	/* Forwarding=Yes, Default TTL=64 */
#else
struct ip_mib ip_statistics={0,64,};	/* Forwarding=No, Default TTL=64 */
#endif
157 /* 158 * Handle the issuing of an ioctl() request 159 * for the ip device. This is scheduled to 160 * disappear 161 */ 162
/*
 *	Handle an ioctl() request for the IP device. Scheduled to
 *	disappear; no commands are recognised at this level, so every
 *	request is rejected with -EINVAL.
 */
int ip_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	/* No IP-level ioctls remain: reject everything. */
	return -EINVAL;
}
172
173 /* 174 * Take an skb, and fill in the MAC header. 175 */ 176
177 staticintip_send(structsk_buff *skb, unsignedlongdaddr, intlen, structdevice *dev, unsignedlongsaddr)
/* */ 178 { 179 intmac = 0;
180
181 skb->dev = dev;
182 skb->arp = 1;
183 if (dev->hard_header)
184 { 185 /* 186 * Build a hardware header. Source address is our mac, destination unknown 187 * (rebuild header will sort this out) 188 */ 189 skb_reserve(skb,(dev->hard_header_len+15)&~15); /* 16 byte aligned IP headers are good */ 190 mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len);
191 if (mac < 0)
192 { 193 mac = -mac;
194 skb->arp = 0;
195 skb->raddr = daddr; /* next routing address */ 196 } 197 } 198 returnmac;
199 } 200
int ip_id_count = 0;	/* Running counter -- presumably stamps the IP id field on the transmit path; consumer not visible in this chunk */
202
/*
 *	This routine builds the appropriate hardware/IP headers for
 *	the routine. It assumes that if *dev != NULL then the
 *	protocol knows what it's doing, otherwise it uses the
 *	routing/ARP tables to select a device struct.
 *
 *	Returns the total header size written (IP header plus MAC header),
 *	-ENETUNREACH when no route exists, or just the MAC header size for
 *	IPPROTO_RAW. NOTE(review): iph->id, iph->tot_len and iph->check are
 *	NOT filled in here -- presumably the transmit path does that; confirm
 *	against ip_queue_xmit.
 */
int ip_build_header(struct sk_buff *skb, unsigned long saddr, unsigned long daddr,
		struct device **dev, int type, struct options *opt, int len, int tos, int ttl)
{
	struct rtable *rt;
	unsigned long raddr;	/* next-hop (router) address */
	int tmp;		/* MAC header length from ip_send() */
	unsigned long src;	/* our address on the chosen route */
	struct iphdr *iph;

	/*
	 *	See if we need to look up the device.
	 */

#ifdef CONFIG_INET_MULTICAST
	/* Multicast sends may be pinned to a device by the IP_MULTICAST_IF name. */
	if(MULTICAST(daddr) && *dev==NULL && skb->sk && *skb->sk->ip_mc_name)
		*dev=dev_get(skb->sk->ip_mc_name);
#endif
	if (*dev == NULL)
	{
		if(skb->localroute)
			rt = ip_rt_local(daddr, NULL, &src);
		else
			rt = ip_rt_route(daddr, NULL, &src);
		if (rt == NULL)
		{
			ip_statistics.IpOutNoRoutes++;
			return(-ENETUNREACH);
		}

		*dev = rt->rt_dev;
		/*
		 *	If the frame is from us and going off machine it MUST MUST MUST
		 *	have the output device ip address and never the loopback
		 */
		if (LOOPBACK(saddr) && !LOOPBACK(daddr))
			saddr = src;/*rt->rt_dev->pa_addr;*/
		raddr = rt->rt_gateway;

	}
	else
	{
		/*
		 *	We still need the address of the first hop.
		 *	NOTE(review): if the route lookup fails here, src stays
		 *	uninitialized yet may be assigned to saddr below -- verify
		 *	callers always supply saddr when they supply *dev.
		 */
		if(skb->localroute)
			rt = ip_rt_local(daddr, NULL, &src);
		else
			rt = ip_rt_route(daddr, NULL, &src);
		/*
		 *	If the frame is from us and going off machine it MUST MUST MUST
		 *	have the output device ip address and never the loopback
		 */
		if (LOOPBACK(saddr) && !LOOPBACK(daddr))
			saddr = src;/*rt->rt_dev->pa_addr;*/

		raddr = (rt == NULL) ? 0 : rt->rt_gateway;
	}

	/*
	 *	No source addr so make it our addr
	 */
	if (saddr == 0)
		saddr = src;

	/*
	 *	No gateway so aim at the real destination
	 */
	if (raddr == 0)
		raddr = daddr;

	/*
	 *	Now build the MAC header.
	 */

	tmp = ip_send(skb, raddr, len, *dev, saddr);

	/*
	 *	Book keeping
	 */

	skb->dev = *dev;
	skb->saddr = saddr;
	if (skb->sk)
		skb->sk->saddr = saddr;

	/*
	 *	Now build the IP header.
	 */

	/*
	 *	If we are using IPPROTO_RAW, then we don't need an IP header, since
	 *	one is being supplied to us by the user
	 */

	if(type == IPPROTO_RAW)
		return (tmp);

	/*
	 *	Build the IP addresses
	 */

	iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr));

	iph->version = 4;
	iph->ihl = 5;		/* no options are emitted here despite the opt parameter */
	iph->tos = tos;
	iph->frag_off = 0;
	iph->ttl = ttl;
	iph->daddr = daddr;
	iph->saddr = saddr;
	iph->protocol = type;
	skb->ip_hdr = iph;

	return(20 + tmp);	/* IP header plus MAC header size */
}
325
326 /* 327 * Generate a checksum for an outgoing IP datagram. 328 */ 329
330 voidip_send_check(structiphdr *iph)
/* */ 331 { 332 iph->check = 0;
333 iph->check = ip_fast_csum((unsignedchar *)iph, iph->ihl);
334 } 335
336 /************************ Fragment Handlers From NET2E **********************************/ 337
338
339 /* 340 * This fragment handler is a bit of a heap. On the other hand it works quite 341 * happily and handles things quite well. 342 */ 343
/* Head of the doubly-linked list of incomplete datagrams; list surgery is guarded by cli()/sti(). */
static struct ipq *ipqueue = NULL;		/* IP fragment queue */
346 /* 347 * Create a new fragment entry. 348 */ 349
350 staticstructipfrag *ip_frag_create(intoffset, intend, structsk_buff *skb, unsignedchar *ptr)
/* */ 351 { 352 structipfrag *fp;
353
354 fp = (structipfrag *) kmalloc(sizeof(structipfrag), GFP_ATOMIC);
355 if (fp == NULL)
356 { 357 NETDEBUG(printk("IP: frag_create: no memory left !\n"));
358 return(NULL);
359 } 360 memset(fp, 0, sizeof(structipfrag));
361
362 /* Fill in the structure. */ 363 fp->offset = offset;
364 fp->end = end;
365 fp->len = end - offset;
366 fp->skb = skb;
367 fp->ptr = ptr;
368
369 return(fp);
370 } 371
372
373 /* 374 * Find the correct entry in the "incomplete datagrams" queue for 375 * this IP datagram, and return the queue entry address if found. 376 */ 377
378 staticstructipq *ip_find(structiphdr *iph)
/* */ 379 { 380 structipq *qp;
381 structipq *qplast;
382
383 cli();
384 qplast = NULL;
385 for(qp = ipqueue; qp != NULL; qplast = qp, qp = qp->next)
386 { 387 if (iph->id== qp->iph->id && iph->saddr == qp->iph->saddr &&
388 iph->daddr == qp->iph->daddr && iph->protocol == qp->iph->protocol)
389 { 390 del_timer(&qp->timer); /* So it doesn't vanish on us. The timer will be reset anyway */ 391 sti();
392 return(qp);
393 } 394 } 395 sti();
396 return(NULL);
397 } 398
399
400 /* 401 * Remove an entry from the "incomplete datagrams" queue, either 402 * because we completed, reassembled and processed it, or because 403 * it timed out. 404 */ 405
/*
 *	Remove an entry from the "incomplete datagrams" queue, either
 *	because we completed, reassembled and processed it, or because
 *	it timed out.
 *
 *	Releases every fragment (including its skb), the saved IP header
 *	copy, and the queue descriptor itself. Interrupts are disabled
 *	around the whole teardown so the timer/receive paths never see a
 *	half-unlinked entry.
 */
static void ip_free(struct ipq *qp)
{
	struct ipfrag *fp;
	struct ipfrag *xp;

	/*
	 *	Stop the timer for this entry.
	 */

	del_timer(&qp->timer);

	/* Remove this entry from the "incomplete datagrams" queue. */
	cli();
	if (qp->prev == NULL)
	{
		/* Entry is the list head. */
		ipqueue = qp->next;
		if (ipqueue != NULL)
			ipqueue->prev = NULL;
	}
	else
	{
		qp->prev->next = qp->next;
		if (qp->next != NULL)
			qp->next->prev = qp->prev;
	}

	/* Release all fragment data. */

	fp = qp->fragments;
	while (fp != NULL)
	{
		xp = fp->next;	/* save successor before freeing fp */
		IS_SKB(fp->skb);
		kfree_skb(fp->skb,FREE_READ);
		kfree_s(fp, sizeof(struct ipfrag));
		fp = xp;
	}

	/* Release the IP header. (64 + 8 matches the kmalloc in ip_create.) */
	kfree_s(qp->iph, 64 + 8);

	/* Finally, release the queue descriptor itself. */
	kfree_s(qp, sizeof(struct ipq));
	sti();
}
452
453 /* 454 * Oops- a fragment queue timed out. Kill it and send an ICMP reply. 455 */ 456
457 staticvoidip_expire(unsignedlongarg)
/* */ 458 { 459 structipq *qp;
460
461 qp = (structipq *)arg;
462
463 /* 464 * Send an ICMP "Fragment Reassembly Timeout" message. 465 */ 466
467 ip_statistics.IpReasmTimeout++;
468 ip_statistics.IpReasmFails++;
469 /* This if is always true... shrug */ 470 if(qp->fragments!=NULL)
471 icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED,
472 ICMP_EXC_FRAGTIME, 0, qp->dev);
473
474 /* 475 * Nuke the fragment queue. 476 */ 477 ip_free(qp);
478 } 479
480
481 /* 482 * Add an entry to the 'ipq' queue for a newly received IP datagram. 483 * We will (hopefully :-) receive all other fragments of this datagram 484 * in time, so we just create a queue for this datagram, in which we 485 * will insert the received fragments at their respective positions. 486 */ 487
488 staticstructipq *ip_create(structsk_buff *skb, structiphdr *iph, structdevice *dev)
/* */ 489 { 490 structipq *qp;
491 intihlen;
492
493 qp = (structipq *) kmalloc(sizeof(structipq), GFP_ATOMIC);
494 if (qp == NULL)
495 { 496 NETDEBUG(printk("IP: create: no memory left !\n"));
497 return(NULL);
498 skb->dev = qp->dev;
499 } 500 memset(qp, 0, sizeof(structipq));
501
502 /* 503 * Allocate memory for the IP header (plus 8 octets for ICMP). 504 */ 505
506 ihlen = iph->ihl * 4;
507 qp->iph = (structiphdr *) kmalloc(64 + 8, GFP_ATOMIC);
508 if (qp->iph == NULL)
509 { 510 NETDEBUG(printk("IP: create: no memory left !\n"));
511 kfree_s(qp, sizeof(structipq));
512 return(NULL);
513 } 514
515 memcpy(qp->iph, iph, ihlen + 8);
516 qp->len = 0;
517 qp->ihlen = ihlen;
518 qp->fragments = NULL;
519 qp->dev = dev;
520
521 /* Start a timer for this entry. */ 522 qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ 523 qp->timer.data = (unsignedlong) qp; /* pointer to queue */ 524 qp->timer.function = ip_expire; /* expire function */ 525 add_timer(&qp->timer);
526
527 /* Add this entry to the queue. */ 528 qp->prev = NULL;
529 cli();
530 qp->next = ipqueue;
531 if (qp->next != NULL)
532 qp->next->prev = qp;
533 ipqueue = qp;
534 sti();
535 return(qp);
536 } 537
538
539 /* 540 * See if a fragment queue is complete. 541 */ 542
543 staticintip_done(structipq *qp)
/* */ 544 { 545 structipfrag *fp;
546 intoffset;
547
548 /* Only possible if we received the final fragment. */ 549 if (qp->len == 0)
550 return(0);
551
552 /* Check all fragment offsets to see if they connect. */ 553 fp = qp->fragments;
554 offset = 0;
555 while (fp != NULL)
556 { 557 if (fp->offset > offset)
558 return(0); /* fragment(s) missing */ 559 offset = fp->end;
560 fp = fp->next;
561 } 562
563 /* All fragments are present. */ 564 return(1);
565 } 566
567
568 /* 569 * Build a new IP datagram from all its fragments. 570 * 571 * FIXME: We copy here because we lack an effective way of handling lists 572 * of bits on input. Until the new skb data handling is in I'm not going 573 * to touch this with a bargepole. 574 */ 575
/*
 *	Build a new IP datagram from all its fragments.
 *
 *	Allocates one skb large enough for header + payload, copies the
 *	saved IP header followed by every fragment at its recorded offset,
 *	then frees the queue. Returns the glued skb, or NULL on allocation
 *	failure or a corrupt fragment list (queue is freed either way).
 *
 *	FIXME: We copy here because we lack an effective way of handling lists
 *	of bits on input. Until the new skb data handling is in I'm not going
 *	to touch this with a bargepole.
 */
static struct sk_buff *ip_glue(struct ipq *qp)
{
	struct sk_buff *skb;
	struct iphdr *iph;
	struct ipfrag *fp;
	unsigned char *ptr;
	int count, len;

	/*
	 *	Allocate a new buffer for the datagram.
	 */
	len = qp->ihlen + qp->len;

	if ((skb = dev_alloc_skb(len)) == NULL)
	{
		ip_statistics.IpReasmFails++;
		NETDEBUG(printk("IP: queue_glue: no memory for gluing queue %p\n", qp));
		ip_free(qp);
		return(NULL);
	}

	/* Fill in the basic details. */
	skb_put(skb,len);
	skb->h.raw = skb->data;
	skb->free = 1;

	/* Copy the original IP headers into the new buffer. */
	ptr = (unsigned char *) skb->h.raw;
	memcpy(ptr, ((unsigned char *) qp->iph), qp->ihlen);
	ptr += qp->ihlen;	/* payload starts right after the header */

	count = 0;		/* payload bytes copied so far */

	/* Copy the data portions of all fragments into the new buffer. */
	fp = qp->fragments;
	while(fp != NULL)
	{
		/* Sanity: a fragment overrunning the buffer means a corrupt list. */
		if(count+fp->len > skb->len)
		{
			NETDEBUG(printk("Invalid fragment list: Fragment over size.\n"));
			ip_free(qp);
			kfree_skb(skb,FREE_WRITE);
			ip_statistics.IpReasmFails++;
			return NULL;
		}
		memcpy((ptr + fp->offset), fp->ptr, fp->len);
		count += fp->len;
		fp = fp->next;
	}

	/* We glued together all fragments, so remove the queue entry. */
	ip_free(qp);

	/* Done with all fragments. Fixup the new IP header. */
	iph = skb->h.iph;
	iph->frag_off = 0;	/* reassembled datagram is no longer a fragment */
	iph->tot_len = htons((iph->ihl * 4) + count);
	skb->ip_hdr = iph;

	ip_statistics.IpReasmOKs++;
	return(skb);
}
639
640 /* 641 * Process an incoming IP datagram fragment. 642 */ 643
644 staticstructsk_buff *ip_defrag(structiphdr *iph, structsk_buff *skb, structdevice *dev)
/* */ 645 { 646 structipfrag *prev, *next, *tmp;
647 structipfrag *tfp;
648 structipq *qp;
649 structsk_buff *skb2;
650 unsignedchar *ptr;
651 intflags, offset;
652 inti, ihl, end;
653
654 ip_statistics.IpReasmReqds++;
655
656 /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */ 657 qp = ip_find(iph);
658
659 /* Is this a non-fragmented datagram? */ 660 offset = ntohs(iph->frag_off);
661 flags = offset & ~IP_OFFSET;
662 offset &= IP_OFFSET;
663 if (((flags & IP_MF) == 0) && (offset == 0))
664 { 665 if (qp != NULL)
666 ip_free(qp); /* Huh? How could this exist?? */ 667 return(skb);
668 } 669
670 offset <<= 3; /* offset is in 8-byte chunks */ 671
672 /* 673 * If the queue already existed, keep restarting its timer as long 674 * as we still are receiving fragments. Otherwise, create a fresh 675 * queue entry. 676 */ 677
678 if (qp != NULL)
679 { 680 del_timer(&qp->timer);
681 qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ 682 qp->timer.data = (unsignedlong) qp; /* pointer to queue */ 683 qp->timer.function = ip_expire; /* expire function */ 684 add_timer(&qp->timer);
685 } 686 else 687 { 688 /* 689 * If we failed to create it, then discard the frame 690 */ 691 if ((qp = ip_create(skb, iph, dev)) == NULL)
692 { 693 skb->sk = NULL;
694 kfree_skb(skb, FREE_READ);
695 ip_statistics.IpReasmFails++;
696 returnNULL;
697 } 698 } 699
700 /* 701 * Determine the position of this fragment. 702 */ 703
704 ihl = iph->ihl * 4;
705 end = offset + ntohs(iph->tot_len) - ihl;
706
707 /* 708 * Point into the IP datagram 'data' part. 709 */ 710
711 ptr = skb->data + ihl;
712
713 /* 714 * Is this the final fragment? 715 */ 716
717 if ((flags & IP_MF) == 0)
718 qp->len = end;
719
720 /* 721 * Find out which fragments are in front and at the back of us 722 * in the chain of fragments so far. We must know where to put 723 * this fragment, right? 724 */ 725
726 prev = NULL;
727 for(next = qp->fragments; next != NULL; next = next->next)
728 { 729 if (next->offset > offset)
730 break; /* bingo! */ 731 prev = next;
732 } 733
734 /* 735 * We found where to put this one. 736 * Check for overlap with preceding fragment, and, if needed, 737 * align things so that any overlaps are eliminated. 738 */ 739 if (prev != NULL && offset < prev->end)
740 { 741 i = prev->end - offset;
742 offset += i; /* ptr into datagram */ 743 ptr += i; /* ptr into fragment data */ 744 } 745
746 /* 747 * Look for overlap with succeeding segments. 748 * If we can merge fragments, do it. 749 */ 750
751 for(tmp=next; tmp != NULL; tmp = tfp)
752 { 753 tfp = tmp->next;
754 if (tmp->offset >= end)
755 break; /* no overlaps at all */ 756
757 i = end - next->offset; /* overlap is 'i' bytes */ 758 tmp->len -= i; /* so reduce size of */ 759 tmp->offset += i; /* next fragment */ 760 tmp->ptr += i;
761 /* 762 * If we get a frag size of <= 0, remove it and the packet 763 * that it goes with. 764 */ 765 if (tmp->len <= 0)
766 { 767 if (tmp->prev != NULL)
768 tmp->prev->next = tmp->next;
769 else 770 qp->fragments = tmp->next;
771
772 if (tfp->next != NULL)
773 tmp->next->prev = tmp->prev;
774
775 next=tfp; /* We have killed the original next frame */ 776
777 kfree_skb(tmp->skb,FREE_READ);
778 kfree_s(tmp, sizeof(structipfrag));
779 } 780 } 781
782 /* 783 * Insert this fragment in the chain of fragments. 784 */ 785
786 tfp = NULL;
787 tfp = ip_frag_create(offset, end, skb, ptr);
788
789 /* 790 * No memory to save the fragment - so throw the lot 791 */ 792
793 if (!tfp)
794 { 795 skb->sk = NULL;
796 kfree_skb(skb, FREE_READ);
797 returnNULL;
798 } 799 tfp->prev = prev;
800 tfp->next = next;
801 if (prev != NULL)
802 prev->next = tfp;
803 else 804 qp->fragments = tfp;
805
806 if (next != NULL)
807 next->prev = tfp;
808
809 /* 810 * OK, so we inserted this new fragment into the chain. 811 * Check if we now have a full IP datagram which we can 812 * bump up to the IP layer... 813 */ 814
815 if (ip_done(qp))
816 { 817 skb2 = ip_glue(qp); /* glue together the fragments */ 818 return(skb2);
819 } 820 return(NULL);
821 } 822
823
824 /* 825 * This IP datagram is too large to be sent in one piece. Break it up into 826 * smaller pieces (each of size equal to the MAC header plus IP header plus 827 * a block of the data of the original IP data part) that will yet fit in a 828 * single device frame, and queue such a frame for sending by calling the 829 * ip_queue_xmit(). Note that this is recursion, and bad things will happen 830 * if this function causes a loop... 831 * 832 * Yes this is inefficient, feel free to submit a quicker one. 833 * 834 * **Protocol Violation** 835 * We copy all the options to each fragment. !FIXME! 836 */ 837
/*
 *	Break an over-sized IP datagram into MTU-sized fragments and queue
 *	each one for transmission via ip_queue_xmit(). The caller keeps
 *	ownership of the original skb. is_frag bit 1 = original datagram was
 *	itself a non-final fragment (keep MF set); bit 2 = fragmenting a
 *	fragment (start from its existing offset).
 *
 *	NOTE(review): each fragment's tot_len/checksum are not adjusted
 *	here -- presumably ip_queue_xmit finishes the header; confirm.
 */
void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag)
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct sk_buff *skb2;
	int left, mtu, hlen, len;
	int offset;
	unsigned long flags;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->data;
	iph = (struct iphdr *) (raw + dev->hard_header_len);

	skb->ip_hdr = iph;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	hlen += dev->hard_header_len;		/* Total header size */
	mtu = (dev->mtu - hlen);		/* Size of data space */
	ptr = (raw + hlen);			/* Where to start from */

	/*
	 *	Check for any "DF" flag. [DF means do not fragment]
	 */

	if (ntohs(iph->frag_off) & IP_DF)
	{
		/*
		 *	Reply giving the MTU of the failed hop.
		 */
		ip_statistics.IpFragFails++;
		icmp_send(skb,ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, dev->mtu, dev);
		return;
	}

	/*
	 *	The protocol doesn't seem to say what to do in the case that the
	 *	frame + options doesn't fit the mtu. As it used to fall down dead
	 *	in this case we were fortunate it didn't happen
	 */

	if(mtu<8)
	{
		/* It's wrong but it's better than nothing */
		icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev);
		ip_statistics.IpFragFails++;
		return;
	}

	/*
	 *	Fragment the datagram.
	 */

	/*
	 *	The initial offset is 0 for a complete frame. When
	 *	fragmenting fragments it's wherever this one starts.
	 */

	if (is_frag & 2)
		offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	else
		offset = 0;


	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0)
	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
		{
			len/=8;
			len*=8;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen+15,GFP_ATOMIC)) == NULL)
		{
			NETDEBUG(printk("IP: frag: no memory for new fragment!\n"));
			ip_statistics.IpFragFails++;
			return;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->arp = skb->arp;
		if(skb->free==0)
			printk("IP fragmenter: BUG free!=1 in fragmenter\n");
		skb2->free = 1;
		skb_put(skb2,len + hlen);
		skb2->h.raw=(char *) skb2->data;
		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		save_flags(flags);
		if (sk)
		{
			cli();
			sk->wmem_alloc += skb2->truesize;
			skb2->sk=sk;
		}
		restore_flags(flags);
		skb2->raddr = skb->raddr;	/* For rebuild_header - must be here */

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->h.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw + hlen, ptr, len);
		left -= len;

		/* h.raw now points at the fragment's IP header. */
		skb2->h.raw+=dev->hard_header_len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/);
		iph->frag_off = htons((offset >> 3));
		/*
		 *	Added AC : If we are fragmenting a fragment thats not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || (is_frag & 1))
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		ip_queue_xmit(sk, dev, skb2, 2);
	}
	ip_statistics.IpFragOKs++;
}
1002
1003
1004 #ifdefCONFIG_IP_FORWARD1005
/*
 *	Forward an IP datagram to its next destination.
 *
 *	Checks the forwarding firewall, decrements the TTL (incrementally
 *	patching the header checksum), routes target_addr, optionally
 *	emits a host redirect, then copies the datagram into a fresh skb
 *	and transmits (fragmenting first if it exceeds the output MTU).
 *	The caller retains ownership of the incoming skb -- except on the
 *	strict-source-route failure path, which frees it itself (NOTE
 *	(review): that asymmetry looks suspicious; verify against the
 *	caller's free).
 */
void ip_forward(struct sk_buff *skb, struct device *dev, int is_frag, unsigned long target_addr, int target_strict)
{
	struct device *dev2;	/* Output device */
	struct iphdr *iph;	/* Our header */
	struct sk_buff *skb2;	/* Output packet */
	struct rtable *rt;	/* Route we use */
	unsigned char *ptr;	/* Data pointer */
	unsigned long raddr;	/* Router IP address */
#ifdef CONFIG_IP_FIREWALL
	int fw_res = 0;		/* Forwarding result */

	/*
	 *	See if we are allowed to forward this.
	 *	Note: demasqueraded fragments are always 'back'warded.
	 */

	if(!(is_frag&4))
	{
		fw_res=ip_fw_chk(skb->h.iph, dev, ip_fw_fwd_chain, ip_fw_fwd_policy, 0);
		switch (fw_res) {
		case 1:			/* plain accept */
#ifdef CONFIG_IP_MASQUERADE
		case 2:			/* accept, but masquerade on output */
#endif
			break;
		case -1:		/* reject: tell the sender */
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev);
			/* fall thru */
		default:		/* silently deny */
			return;
		}
	}
#endif
	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 *
	 *	Exception:
	 *	We may not generate an ICMP for an ICMP. icmp_send does the
	 *	enforcement of this so we can forget it here. It is however
	 *	sometimes VERY important.
	 */

	iph = skb->h.iph;
	iph->ttl--;

	/*
	 *	Re-compute the IP header checksum.
	 *	This is inefficient. We know what has happened to the header
	 *	and could thus adjust the checksum as Phil Karn does in KA9Q
	 *	(TTL dropped by one == checksum high byte rises by one, with
	 *	end-around carry).
	 */

	iph->check = ntohs(iph->check) + 0x0100;
	if ((iph->check & 0xFF00) == 0)
		iph->check++;		/* carry overflow */
	iph->check = htons(iph->check);

	if (iph->ttl <= 0)
	{
		/* Tell the sender its packet died... */
		icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0, dev);
		return;
	}

	/*
	 *	OK, the packet is still valid. Fetch its destination address,
	 *	and give it to the IP sender for further processing.
	 */

	rt = ip_rt_route(target_addr, NULL, NULL);
	if (rt == NULL)
	{
		/*
		 *	Tell the sender its packet cannot be delivered. Again
		 *	ICMP is screened later.
		 */
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0, dev);
		return;
	}


	/*
	 *	Gosh. Not only is the packet valid; we even know how to
	 *	forward it onto its final destination. Can we say this
	 *	is being plain lucky?
	 *	If the router told us that there is no GW, use the dest.
	 *	IP address itself- we seem to be connected directly...
	 */

	raddr = rt->rt_gateway;

	if (raddr != 0)
	{
		/*
		 *	Strict routing permits no gatewaying
		 */

		if(target_strict)
		{
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0, dev);
			kfree_skb(skb, FREE_READ);
			return;
		}

		/*
		 *	There is a gateway so find the correct route for it.
		 *	Gateways cannot in turn be gatewayed.
		 */

		rt = ip_rt_route(raddr, NULL, NULL);
		if (rt == NULL)
		{
			/*
			 *	Tell the sender its packet cannot be delivered...
			 */
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev);
			return;
		}
		if (rt->rt_gateway != 0)
			raddr = rt->rt_gateway;
	}
	else
		raddr = target_addr;

	/*
	 *	Having picked a route we can now send the frame out.
	 */

	dev2 = rt->rt_dev;

	/*
	 *	In IP you never have to forward a frame on the interface that it
	 *	arrived upon. We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
#ifndef CONFIG_IP_NO_ICMP_REDIRECT
	if (dev == dev2 && !((iph->saddr^iph->daddr)&dev->pa_mask) && rt->rt_flags&RTF_MODIFIED)
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, raddr, dev);
#endif

	/*
	 *	We now allocate a new buffer, and copy the datagram into it.
	 *	If the indicated interface is up and running, kick it.
	 */

	if (dev2->flags & IFF_UP)
	{
#ifdef CONFIG_IP_MASQUERADE
		/*
		 *	If this fragment needs masquerading, make it so...
		 *	(Dont masquerade de-masqueraded fragments)
		 */
		if (!(is_frag&4) && fw_res==2)
			ip_fw_masquerade(&skb, dev2);
#endif

		/*
		 *	Current design decrees we copy the packet. For identical header
		 *	lengths we could avoid it. The new skb code will let us push
		 *	data so the problem goes away then.
		 */

		skb2 = alloc_skb(dev2->hard_header_len + skb->len + 15, GFP_ATOMIC);

		/*
		 *	This is rare and since IP is tolerant of network failures
		 *	quite harmless.
		 */

		if (skb2 == NULL)
		{
			NETDEBUG(printk("\nIP: No memory available for IP forward\n"));
			return;
		}


		/* Now build the MAC header. */
		(void) ip_send(skb2, raddr, skb->len, dev2, dev2->pa_addr);

		ptr = skb_put(skb2,skb->len);
		skb2->free = 1;
		skb2->h.raw = ptr;

		/*
		 *	Copy the packet data into the new buffer.
		 */
		memcpy(ptr, skb->h.raw, skb->len);


		ip_statistics.IpForwDatagrams++;

		/*
		 *	See if it needs fragmenting. Note in ip_rcv we tagged
		 *	the fragment type. This must be right so that
		 *	the fragmenter does the right thing.
		 */

		if(skb2->len > dev2->mtu + dev2->hard_header_len)
		{
			ip_fragment(NULL,skb2,dev2, is_frag);
			kfree_skb(skb2,FREE_WRITE);
		}
		else
		{
#ifdef CONFIG_IP_ACCT
			/*
			 *	Count mapping we shortcut
			 */

			ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
#endif

			/*
			 *	Map service types to priority. We lie about
			 *	throughput being low priority, but it's a good
			 *	choice to help improve general usage.
			 */
			if(iph->tos & IPTOS_LOWDELAY)
				dev_queue_xmit(skb2, dev2, SOPRI_INTERACTIVE);
			else if(iph->tos & IPTOS_THROUGHPUT)
				dev_queue_xmit(skb2, dev2, SOPRI_BACKGROUND);
			else
				dev_queue_xmit(skb2, dev2, SOPRI_NORMAL);
		}
	}
}
1239
1240 #endif1241
/*
 *	This function receives all incoming IP datagrams.
 *
 *	On entry skb->data points to the start of the IP header and
 *	the MAC header has been removed.
 */

int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
	struct iphdr *iph = skb->h.iph;
	struct sock *raw_sk=NULL;
	unsigned char hash;
	unsigned char flag = 0;			/* Set once any protocol handler has consumed the frame */
	struct inet_protocol *ipprot;
	int brd=IS_MYADDR;			/* Address class of daddr; IS_MYADDR until proven otherwise */
	unsigned long target_addr;		/* Where a forwarded frame should go (may be rewritten by source routing) */
	int target_strict=0;			/* 1 => strict source route (SSRR) in effect */
	int is_frag=0;				/* Bit 0: more fragments follow; bit 1: not the first fragment */
#ifdef CONFIG_IP_FIREWALL
	int err;
#endif

	ip_statistics.IpInReceives++;

	/*
	 *	Tag the ip header of this packet so we can find it
	 */

	skb->ip_hdr = iph;

	/*
	 *	RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
	 *	RFC1122: 3.1.2.3 MUST discard a frame with invalid source address [NEEDS FIXING].
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 *	(5.	We ought to check for IP multicast addresses and undefined types.. does this matter ?)
	 */

	if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0
		|| skb->len < ntohs(iph->tot_len))
	{
		ip_statistics.IpInHdrErrors++;
		kfree_skb(skb, FREE_WRITE);
		return(0);
	}

	/*
	 *	Our transport medium may have padded the buffer out. Now we know it
	 *	is IP we can trim to the true length of the frame.
	 */

	skb_trim(skb,ntohs(iph->tot_len));

	/*
	 *	See if the firewall wants to dispose of the packet.
	 */

#ifdef	CONFIG_IP_FIREWALL

	if ((err=ip_fw_chk(iph,dev,ip_fw_blk_chain,ip_fw_blk_policy, 0))<1)
	{
		/* err == -1 means "reject with ICMP"; other values mean silent drop */
		if(err==-1)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev);
		kfree_skb(skb, FREE_WRITE);
		return 0;
	}

#endif

	/*
	 *	Next analyse the packet for options. Studies show under one packet in
	 *	a thousand have options....
	 */

	target_addr = iph->daddr;

	if (iph->ihl != 5)
	{
		/* Humph.. options. Lots of annoying fiddly bits */

		/*
		 *	This is straight from the RFC. It might even be right ;)
		 *
		 *	RFC 1122: 3.2.1.8 STREAMID option is obsolete and MUST be ignored.
		 *	RFC 1122: 3.2.1.8 MUST NOT crash on a zero length option.
		 *	RFC 1122: 3.2.1.8 MUST support acting as final destination of a source route.
		 */

		int opt_space=4*(iph->ihl-5);
		int opt_size;
		unsigned char *opt_ptr=skb->h.raw+sizeof(struct iphdr);

		skb->ip_summed=0;		/* Our free checksum is bogus for this case */

		while(opt_space>0)
		{
			if(*opt_ptr==IPOPT_NOOP)
			{
				opt_ptr++;
				opt_space--;
				continue;
			}
			if(*opt_ptr==IPOPT_END)
				break;	/* Done */
			/* Malformed length octet: too small, or runs past the option area */
			if(opt_space<2 || (opt_size=opt_ptr[1])<2 || opt_ptr[1]>opt_space)
			{
				/*
				 *	RFC 1122: 3.2.2.5 SHOULD send parameter problem reports.
				 */
				icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
				kfree_skb(skb, FREE_READ);
				return -EINVAL;
			}
			switch(opt_ptr[0])
			{
				case IPOPT_SEC:
					/* Should we drop this ?? */
					break;
				case IPOPT_SSRR:		/* These work almost the same way */
					target_strict=1;
					/* Fall through */
				case IPOPT_LSRR:
#ifdef CONFIG_IP_NOSR
					/* Source routing disabled by configuration: junk the frame */
					kfree_skb(skb, FREE_READ);
					return -EINVAL;
#endif
				case IPOPT_RR:
					/*
					 *	RFC 1122: 3.2.1.8 Support for RR is OPTIONAL.
					 *	Only process the option if the frame is addressed to us.
					 */
					if (iph->daddr!=skb->dev->pa_addr && (brd = ip_chk_addr(iph->daddr)) == 0)
						break;
					if((opt_size<3) || ( opt_ptr[0]==IPOPT_RR && opt_ptr[2] > opt_size-4 ))
					{
						if(ip_chk_addr(iph->daddr))
							icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
						kfree_skb(skb, FREE_READ);
						return -EINVAL;
					}
					if(opt_ptr[2] > opt_size-4 )
						break;		/* Route list full: nothing more to record */
					/* Bytes are [IPOPT_xxRR][Length][EntryPointer][Entry0][Entry1].... */
					/* This isn't going to be too portable - FIXME */
					if(opt_ptr[0]!=IPOPT_RR)
					{
						int t;
						target_addr=*(u32 *)(&opt_ptr[opt_ptr[2]]);	/* Get hop */
						t=ip_chk_addr(target_addr);
						/* Source-routing a frame to broadcast/multicast is bogus */
						if(t==IS_MULTICAST||t==IS_BROADCAST)
						{
							if(ip_chk_addr(iph->daddr))
								icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
							kfree_skb(skb,FREE_READ);
							return -EINVAL;
						}
					}
					*(u32 *)(&opt_ptr[opt_ptr[2]])=skb->dev->pa_addr;	/* Record hop */
					break;
				case IPOPT_TIMESTAMP:
					/*
					 *	RFC 1122: 3.2.1.8 The timestamp option is OPTIONAL but if implemented
					 *	MUST meet various rules (read the spec).
					 */
					NETDEBUG(printk("ICMP: Someone finish the timestamp routine ;)\n"));
					break;
				default:
					break;
			}
			opt_ptr+=opt_size;
			opt_space-=opt_size;
		}

	}


	/*
	 *	Remember if the frame is fragmented.
	 */

	if(iph->frag_off)
	{
		if (iph->frag_off & htons(IP_MF))
			is_frag|=1;

		/*
		 *	Last fragment ?
		 */

		if (iph->frag_off & htons(IP_OFFSET))
			is_frag|=2;
	}

	/*
	 *	Do any IP forwarding required.  chk_addr() is expensive -- avoid it someday.
	 *
	 *	This is inefficient. While finding out if it is for us we could also compute
	 *	the routing table entry. This is where the great unified cache theory comes
	 *	in as and when someone implements it
	 *
	 *	For most hosts over 99% of packets match the first conditional
	 *	and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at
	 *	function entry.
	 */

	if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0)
	{
#ifdef CONFIG_IP_MULTICAST

		if(brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK))
		{
			/*
			 *	Check it is for one of our groups
			 */
			struct ip_mc_list *ip_mc=dev->ip_mc_list;
			do
			{
				if(ip_mc==NULL)
				{
					kfree_skb(skb, FREE_WRITE);
					return 0;
				}
				if(ip_mc->multiaddr==iph->daddr)
					break;
				ip_mc=ip_mc->next;
			}
			while(1);
		}
#endif

#ifdef CONFIG_IP_MASQUERADE
		/*
		 *	Do we need to de-masquerade this fragment?
		 */
		if (ip_fw_demasquerade(skb))
		{
			struct iphdr *iph=skb->h.iph;
			/* is_frag|4 marks the frame as de-masqueraded so it isn't re-masqueraded */
			ip_forward(skb, dev, is_frag|4, iph->daddr, 0);
			kfree_skb(skb, FREE_WRITE);
			return(0);
		}
#endif

		/*
		 *	Account for the packet
		 */

#ifdef CONFIG_IP_ACCT
		ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
#endif

		/*
		 *	Reassemble IP fragments.
		 */

		if(is_frag)
		{
			/* Defragment. Obtain the complete packet if there is one */
			skb=ip_defrag(iph,skb,dev);
			if(skb==NULL)
				return 0;	/* Still waiting for more fragments */
			skb->dev = dev;
			iph=skb->h.iph;
		}

		/*
		 *	Point into the IP datagram, just past the header.
		 */

		skb->ip_hdr = iph;
		skb->h.raw += iph->ihl*4;

		/*
		 *	Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies.
		 *
		 *	RFC 1122: SHOULD pass TOS value up to the transport layer.
		 */

		hash = iph->protocol & (SOCK_ARRAY_SIZE-1);

		/*
		 *	If there maybe a raw socket we must check - if not we don't care less
		 */

		if((raw_sk=raw_prot.sock_array[hash])!=NULL)
		{
			struct sock *sknext=NULL;
			struct sk_buff *skb1;
			raw_sk=get_sock_raw(raw_sk, hash,  iph->saddr, iph->daddr);
			if(raw_sk)	/* Any raw sockets */
			{
				do
				{
					/* Find the next */
					sknext=get_sock_raw(raw_sk->next, hash, iph->saddr, iph->daddr);
					if(sknext)
						skb1=skb_clone(skb, GFP_ATOMIC);
					else
						break;	/* One pending raw socket left */
					if(skb1)
						raw_rcv(raw_sk, skb1, dev, iph->saddr,iph->daddr);
					raw_sk=sknext;
				}
				while(raw_sk!=NULL);

				/*
				 *	Here either raw_sk is the last raw socket, or NULL if none
				 */

				/*
				 *	We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy
				 */
			}
		}

		/*
		 *	skb->h.raw now points at the protocol beyond the IP header.
		 */

		hash = iph->protocol & (MAX_INET_PROTOS -1);
		for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next)
		{
			struct sk_buff *skb2;

			if (ipprot->protocol != iph->protocol)
				continue;
			/*
			 *	See if we need to make a copy of it.  This will
			 *	only be set if more than one protocol wants it.
			 *	and then not for the last one. If there is a pending
			 *	raw delivery wait for that
			 */

			if (ipprot->copy || raw_sk)
			{
				skb2 = skb_clone(skb, GFP_ATOMIC);
				if(skb2==NULL)
					continue;
			}
			else
			{
				skb2 = skb;
			}
			flag = 1;

			/*
			 *	Pass on the datagram to each protocol that wants it,
			 *	based on the datagram protocol.  We should really
			 *	check the protocol handler's return values here...
			 */

			ipprot->handler(skb2, dev, NULL, iph->daddr,
				(ntohs(iph->tot_len) - (iph->ihl * 4)),
				iph->saddr, 0, ipprot);

		}

		/*
		 *	All protocols checked.
		 *	If this packet was a broadcast, we may *not* reply to it, since that
		 *	causes (proven, grin) ARP storms and a leakage of memory (i.e. all
		 *	ICMP reply messages get queued up for transmission...)
		 */

		if(raw_sk!=NULL)	/* Shift to last raw user */
			raw_rcv(raw_sk, skb, dev, iph->saddr, iph->daddr);
		else if (!flag)		/* Free and report errors */
		{
			if (brd != IS_BROADCAST && brd!=IS_MULTICAST)
				icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev);
			kfree_skb(skb, FREE_WRITE);
		}

		return(0);
	}

	/*
	 *	Don't forward multicast or broadcast frames.
	 */

	if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST)
	{
		kfree_skb(skb,FREE_WRITE);
		return 0;
	}

	/*
	 *	The packet is for another target. Forward the frame
	 */

#ifdef CONFIG_IP_FORWARD
	ip_forward(skb, dev, is_frag, target_addr, target_strict);
#else
/*	printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n",
			iph->saddr,iph->daddr);*/
	ip_statistics.IpInAddrErrors++;
#endif
	/*
	 *	The forwarder is inefficient and copies the packet. We
	 *	free the original now.
	 */

	kfree_skb(skb, FREE_WRITE);
	return(0);
}
1664
1665 /*1666 * Loop a packet back to the sender.1667 */1668
1669 staticvoidip_loopback(structdevice *old_dev, structsk_buff *skb)
/* */1670 {1671 externstructdeviceloopback_dev;
1672 structdevice *dev=&loopback_dev;
1673 intlen=skb->len-old_dev->hard_header_len;
1674 structsk_buff *newskb=dev_alloc_skb(len+dev->hard_header_len+15);
1675
1676 if(newskb==NULL)
1677 return;
1678
1679 newskb->link3=NULL;
1680 newskb->sk=NULL;
1681 newskb->dev=dev;
1682 newskb->saddr=skb->saddr;
1683 newskb->daddr=skb->daddr;
1684 newskb->raddr=skb->raddr;
1685 newskb->free=1;
1686 newskb->lock=0;
1687 newskb->users=0;
1688 newskb->pkt_type=skb->pkt_type;
1689
1690 /*1691 * Put a MAC header on the packet1692 */1693 ip_send(newskb, skb->ip_hdr->daddr, len, dev, skb->ip_hdr->saddr);
1694 /*1695 * Add the rest of the data space. 1696 */1697 newskb->ip_hdr=(structiphdr *)skb_put(skb, len);
1698 /*1699 * Copy the data1700 */1701 memcpy(newskb->ip_hdr,skb->ip_hdr,len);
1702
1703 /* Recurse. The device check against IFF_LOOPBACK will stop infinite recursion */1704
1705 /*printk("Loopback output queued [%lX to %lX].\n", newskb->ip_hdr->saddr,newskb->ip_hdr->daddr);*/1706 ip_queue_xmit(NULL, dev, newskb, 1);
1707 }1708
1709
1710 /*1711 * Queues a packet to be sent, and starts the transmitter1712 * if necessary. if free = 1 then we free the block after1713 * transmit, otherwise we don't. If free==2 we not only1714 * free the block but also don't assign a new ip seq number.1715 * This routine also needs to put in the total length,1716 * and compute the checksum1717 */1718
1719 voidip_queue_xmit(structsock *sk, structdevice *dev,
/* */1720 structsk_buff *skb, intfree)
1721 {1722 structiphdr *iph;
1723 unsignedchar *ptr;
1724
1725 /* Sanity check */1726 if (dev == NULL)
1727 {1728 NETDEBUG(printk("IP: ip_queue_xmit dev = NULL\n"));
1729 return;
1730 }1731
1732 IS_SKB(skb);
1733
1734 /*1735 * Do some book-keeping in the packet for later1736 */1737
1738
1739 skb->dev = dev;
1740 skb->when = jiffies;
1741
1742 /*1743 * Find the IP header and set the length. This is bad1744 * but once we get the skb data handling code in the1745 * hardware will push its header sensibly and we will1746 * set skb->ip_hdr to avoid this mess and the fixed1747 * header length problem1748 */1749
1750 ptr = skb->data;
1751 ptr += dev->hard_header_len;
1752 iph = (structiphdr *)ptr;
1753 skb->ip_hdr = iph;
1754 iph->tot_len = ntohs(skb->len-dev->hard_header_len);
1755
1756 #ifdefCONFIG_IP_FIREWALL1757 if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
1758 /* just don't send this packet */1759 return;
1760 #endif1761
1762 /*1763 * No reassigning numbers to fragments...1764 */1765
1766 if(free!=2)
1767 iph->id = htons(ip_id_count++);
1768 else1769 free=1;
1770
1771 /* All buffers without an owner socket get freed */1772 if (sk == NULL)
1773 free = 1;
1774
1775 skb->free = free;
1776
1777 /*1778 * Do we need to fragment. Again this is inefficient.1779 * We need to somehow lock the original buffer and use1780 * bits of it.1781 */1782
1783 if(skb->len > dev->mtu + dev->hard_header_len)
1784 {1785 ip_fragment(sk,skb,dev,0);
1786 IS_SKB(skb);
1787 kfree_skb(skb,FREE_WRITE);
1788 return;
1789 }1790
1791 /*1792 * Add an IP checksum1793 */1794
1795 ip_send_check(iph);
1796
1797 /*1798 * Print the frame when debugging1799 */1800
1801 /*1802 * More debugging. You cannot queue a packet already on a list1803 * Spot this and moan loudly.1804 */1805 if (skb->next != NULL)
1806 {1807 NETDEBUG(printk("ip_queue_xmit: next != NULL\n"));
1808 skb_unlink(skb);
1809 }1810
1811 /*1812 * If a sender wishes the packet to remain unfreed1813 * we add it to his send queue. This arguably belongs1814 * in the TCP level since nobody else uses it. BUT1815 * remember IPng might change all the rules.1816 */1817
1818 if (!free)
1819 {1820 unsignedlongflags;
1821 /* The socket now has more outstanding blocks */1822
1823 sk->packets_out++;
1824
1825 /* Protect the list for a moment */1826 save_flags(flags);
1827 cli();
1828
1829 if (skb->link3 != NULL)
1830 {1831 NETDEBUG(printk("ip.c: link3 != NULL\n"));
1832 skb->link3 = NULL;
1833 }1834 if (sk->send_head == NULL)
1835 {1836 sk->send_tail = skb;
1837 sk->send_head = skb;
1838 }1839 else1840 {1841 sk->send_tail->link3 = skb;
1842 sk->send_tail = skb;
1843 }1844 /* skb->link3 is NULL */1845
1846 /* Interrupt restore */1847 restore_flags(flags);
1848 }1849 else1850 /* Remember who owns the buffer */1851 skb->sk = sk;
1852
1853 /*1854 * If the indicated interface is up and running, send the packet.1855 */1856
1857 ip_statistics.IpOutRequests++;
1858 #ifdefCONFIG_IP_ACCT1859 ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
1860 #endif1861
1862 #ifdefCONFIG_IP_MULTICAST1863
1864 /*1865 * Multicasts are looped back for other local users1866 */1867
1868 if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
1869 {1870 if(sk==NULL || sk->ip_mc_loop)
1871 {1872 if(iph->daddr==IGMP_ALL_HOSTS)
1873 ip_loopback(dev,skb);
1874 else1875 {1876 structip_mc_list *imc=dev->ip_mc_list;
1877 while(imc!=NULL)
1878 {1879 if(imc->multiaddr==iph->daddr)
1880 {1881 ip_loopback(dev,skb);
1882 break;
1883 }1884 imc=imc->next;
1885 }1886 }1887 }1888 /* Multicasts with ttl 0 must not go beyond the host */1889
1890 if(skb->ip_hdr->ttl==0)
1891 {1892 kfree_skb(skb, FREE_READ);
1893 return;
1894 }1895 }1896 #endif1897 if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
1898 ip_loopback(dev,skb);
1899
1900 if (dev->flags & IFF_UP)
1901 {1902 /*1903 * If we have an owner use its priority setting,1904 * otherwise use NORMAL1905 */1906
1907 if (sk != NULL)
1908 {1909 dev_queue_xmit(skb, dev, sk->priority);
1910 }1911 else1912 {1913 dev_queue_xmit(skb, dev, SOPRI_NORMAL);
1914 }1915 }1916 else1917 {1918 ip_statistics.IpOutDiscards++;
1919 if (free)
1920 kfree_skb(skb, FREE_WRITE);
1921 }1922 }1923
1924
1925
1926 #ifdefCONFIG_IP_MULTICAST1927
1928 /*1929 * Write an multicast group list table for the IGMP daemon to1930 * read.1931 */1932
1933 intip_mc_procinfo(char *buffer, char **start, off_toffset, intlength)
/* */1934 {1935 off_tpos=0, begin=0;
1936 structip_mc_list *im;
1937 unsignedlongflags;
1938 intlen=0;
1939 structdevice *dev;
1940
1941 len=sprintf(buffer,"Device : Count\tGroup Users Timer\n");
1942 save_flags(flags);
1943 cli();
1944
1945 for(dev = dev_base; dev; dev = dev->next)
1946 {1947 if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST))
1948 {1949 len+=sprintf(buffer+len,"%-10s: %5d\n",
1950 dev->name, dev->mc_count);
1951 for(im = dev->ip_mc_list; im; im = im->next)
1952 {1953 len+=sprintf(buffer+len,
1954 "\t\t\t%08lX %5d %d:%08lX\n",
1955 im->multiaddr, im->users,
1956 im->tm_running, im->timer.expires);
1957 pos=begin+len;
1958 if(pos<offset)
1959 {1960 len=0;
1961 begin=pos;
1962 }1963 if(pos>offset+length)
1964 break;
1965 }1966 }1967 }1968 restore_flags(flags);
1969 *start=buffer+(offset-begin);
1970 len-=(offset-begin);
1971 if(len>length)
1972 len=length;
1973 returnlen;
1974 }1975
1976
1977 #endif1978 /*1979 * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on1980 * an IP socket.1981 *1982 * We implement IP_TOS (type of service), IP_TTL (time to live).1983 *1984 * Next release we will sort out IP_OPTIONS since for some people are kind of important.1985 */1986
1987 staticstructdevice *ip_mc_find_devfor(unsignedlongaddr)
/* */1988 {1989 structdevice *dev;
1990 for(dev = dev_base; dev; dev = dev->next)
1991 {1992 if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)&&
1993 (dev->pa_addr==addr))
1994 returndev;
1995 }1996
1997 returnNULL;
1998 }1999
/*
 *	Socket option code for IP. This is the end of the line after any TCP,UDP etc options on
 *	an IP socket.
 *
 *	We implement IP_TOS (type of service), IP_TTL (time to live),
 *	the multicast options, and (when configured) the firewall and
 *	accounting control calls. `optval` points into user space; it is
 *	validated with verify_area() before each copy.
 */

int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val,err;
	unsigned char ucval;
#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT)
	struct ip_fw tmp_fw;
#endif
	if (optval == NULL)
		return(-EINVAL);

	err=verify_area(VERIFY_READ, optval, sizeof(int));
	if(err)
		return err;

	/* Fetch the argument both as an int and as a byte: which one a
	   given option uses depends on the option (BSD compatibility). */
	val = get_user((int *) optval);
	ucval=get_user((unsigned char *) optval);

	if(level!=SOL_IP)
		return -EOPNOTSUPP;

	switch(optname)
	{
		case IP_TOS:
			if(val<0||val>255)
				return -EINVAL;
			sk->ip_tos=val;
			/* TOS also adjusts the default queueing priority */
			if(val==IPTOS_LOWDELAY)
				sk->priority=SOPRI_INTERACTIVE;
			if(val==IPTOS_THROUGHPUT)
				sk->priority=SOPRI_BACKGROUND;
			return 0;
		case IP_TTL:
			if(val<1||val>255)
				return -EINVAL;
			sk->ip_ttl=val;
			return 0;
#ifdef CONFIG_IP_MULTICAST
		case IP_MULTICAST_TTL:
		{
			/* TTL for outgoing multicasts (byte argument, BSD style) */
			sk->ip_mc_ttl=(int)ucval;
			return 0;
		}
		case IP_MULTICAST_LOOP:
		{
			if(ucval!=0 && ucval!=1)
				return -EINVAL;
			sk->ip_mc_loop=(int)ucval;
			return 0;
		}
		case IP_MULTICAST_IF:
		{
			struct in_addr addr;
			struct device *dev=NULL;

			/*
			 *	Check the arguments are allowable
			 */

			err=verify_area(VERIFY_READ, optval, sizeof(addr));
			if(err)
				return err;

			memcpy_fromfs(&addr,optval,sizeof(addr));


			/*
			 *	What address has been requested
			 */

			if(addr.s_addr==INADDR_ANY)	/* Default */
			{
				sk->ip_mc_name[0]=0;
				return 0;
			}

			/*
			 *	Find the device
			 */

			dev=ip_mc_find_devfor(addr.s_addr);

			/*
			 *	Did we find one
			 */

			if(dev)
			{
				strcpy(sk->ip_mc_name,dev->name);
				return 0;
			}
			return -EADDRNOTAVAIL;
		}

		case IP_ADD_MEMBERSHIP:
		{

			/*
			 *	FIXME: Add/Del membership should have a semaphore protecting them from re-entry
			 */
			struct ip_mreq mreq;
			unsigned long route_src;
			struct rtable *rt;
			struct device *dev=NULL;

			/*
			 *	Check the arguments.
			 */

			err=verify_area(VERIFY_READ, optval, sizeof(mreq));
			if(err)
				return err;

			memcpy_fromfs(&mreq,optval,sizeof(mreq));

			/*
			 *	Get device for use later
			 */

			if(mreq.imr_interface.s_addr==INADDR_ANY)
			{
				/*
				 *	Not set so scan: route towards the group and use
				 *	whatever interface the routing table picks.
				 */
				if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL)
				{
					dev=rt->rt_dev;
					rt->rt_use--;	/* release the reference ip_rt_route took */
				}
			}
			else
			{
				/*
				 *	Find a suitable device.
				 */

				dev=ip_mc_find_devfor(mreq.imr_interface.s_addr);
			}

			/*
			 *	No device, no cookies.
			 */

			if(!dev)
				return -ENODEV;

			/*
			 *	Join group.
			 */

			return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr);
		}

		case IP_DROP_MEMBERSHIP:
		{
			struct ip_mreq mreq;
			struct rtable *rt;
			unsigned long route_src;
			struct device *dev=NULL;

			/*
			 *	Check the arguments
			 */

			err=verify_area(VERIFY_READ, optval, sizeof(mreq));
			if(err)
				return err;

			memcpy_fromfs(&mreq,optval,sizeof(mreq));

			/*
			 *	Get device for use later
			 */

			if(mreq.imr_interface.s_addr==INADDR_ANY)
			{
				if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL)
				{
					dev=rt->rt_dev;
					rt->rt_use--;
				}
			}
			else
			{

				dev=ip_mc_find_devfor(mreq.imr_interface.s_addr);
			}

			/*
			 *	Did we find a suitable device.
			 */

			if(!dev)
				return -ENODEV;

			/*
			 *	Leave group
			 */

			return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr);
		}
#endif
#ifdef CONFIG_IP_FIREWALL
		case IP_FW_ADD_BLK:
		case IP_FW_DEL_BLK:
		case IP_FW_ADD_FWD:
		case IP_FW_DEL_FWD:
		case IP_FW_CHK_BLK:
		case IP_FW_CHK_FWD:
		case IP_FW_FLUSH_BLK:
		case IP_FW_FLUSH_FWD:
		case IP_FW_ZERO_BLK:
		case IP_FW_ZERO_FWD:
		case IP_FW_POLICY_BLK:
		case IP_FW_POLICY_FWD:
			/* Firewall control: root only */
			if(!suser())
				return -EPERM;
			if(optlen>sizeof(tmp_fw) || optlen<1)
				return -EINVAL;
			err=verify_area(VERIFY_READ,optval,optlen);
			if(err)
				return err;
			memcpy_fromfs(&tmp_fw,optval,optlen);
			err=ip_fw_ctl(optname, &tmp_fw,optlen);
			return -err;	/* -0 is 0 after all */

#endif
#ifdef CONFIG_IP_ACCT
		case IP_ACCT_DEL:
		case IP_ACCT_ADD:
		case IP_ACCT_FLUSH:
		case IP_ACCT_ZERO:
			/* Accounting control: root only */
			if(!suser())
				return -EPERM;
			if(optlen>sizeof(tmp_fw) || optlen<1)
				return -EINVAL;
			err=verify_area(VERIFY_READ,optval,optlen);
			if(err)
				return err;
			memcpy_fromfs(&tmp_fw, optval,optlen);
			err=ip_acct_ctl(optname, &tmp_fw,optlen);
			return -err;	/* -0 is 0 after all */
#endif
		/* IP_OPTIONS and friends go here eventually */
		default:
			return(-ENOPROTOOPT);
	}
}
2248 /*2249 * Get the options. Note for future reference. The GET of IP options gets the2250 * _received_ ones. The set sets the _sent_ ones.2251 */2252
/*
 *	Get the options. Note for future reference. The GET of IP options gets the
 *	_received_ ones. The set sets the _sent_ ones.
 *
 *	Most options fall through the switch into the common int write-back
 *	at the bottom; IP_MULTICAST_IF returns the interface name directly
 *	and so handles its own copy-out.
 */

int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;
#ifdef CONFIG_IP_MULTICAST
	int len;
#endif

	if(level!=SOL_IP)
		return -EOPNOTSUPP;

	switch(optname)
	{
		case IP_TOS:
			val=sk->ip_tos;
			break;
		case IP_TTL:
			val=sk->ip_ttl;
			break;
#ifdef CONFIG_IP_MULTICAST
		case IP_MULTICAST_TTL:
			val=sk->ip_mc_ttl;
			break;
		case IP_MULTICAST_LOOP:
			val=sk->ip_mc_loop;
			break;
		case IP_MULTICAST_IF:
			/* Returns the device NAME (not an address), length in *optlen.
			   NOTE: the copied string is not NUL-terminated. */
			err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
			if(err)
				return err;
			len=strlen(sk->ip_mc_name);
			err=verify_area(VERIFY_WRITE, optval, len);
			if(err)
				return err;
			put_user(len,(int *) optlen);
			memcpy_tofs((void *)optval,sk->ip_mc_name, len);
			return 0;
#endif
		default:
			return(-ENOPROTOOPT);
	}
	/* Common path: write an int result and its size back to user space */
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_user(sizeof(int),(int *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_user(val,(int *) optval);

	return(0);
}
2306 /*2307 * Build and send a packet, with as little as one copy2308 *2309 * Doesn't care much about ip options... option length can be2310 * different for fragment at 0 and other fragments.2311 *2312 * Note that the fragment at the highest offset is sent first,2313 * so the getfrag routine can fill in the TCP/UDP checksum header2314 * field in the last fragment it sends... actually it also helps2315 * the reassemblers, they can put most packets in at the head of2316 * the fragment queue, and they know the total size in advance. This2317 * last feature will measurable improve the Linux fragment handler.2318 *2319 * The callback has five args, an arbitrary pointer (copy of frag),2320 * the source IP address (may depend on the routing table), the 2321 * destination adddress (char *), the offset to copy from, and the2322 * length to be copied.2323 * 2324 */2325
2326 intip_build_xmit(structsock *sk,
/* */2327 voidgetfrag (void *,
2328 int,
2329 char *,
2330 unsignedint,
2331 unsignedint),
2332 void *frag,
2333 unsignedshortintlength,
2334 intdaddr,
2335 intflags,
2336 inttype)
2337 {2338 structrtable *rt;
2339 unsignedintfraglen, maxfraglen, fragheaderlen;
2340 intoffset, mf;
2341 unsignedlongsaddr;
2342 unsignedshortid;
2343 structiphdr *iph;
2344 intlocal=0;
2345 structdevice *dev;
2346
2347
2348 #ifdefCONFIG_INET_MULTICAST2349 if(sk && MULTICAST(daddr) && *sk->ip_mc_name)
2350 {2351 dev=dev_get(skb->ip_mc_name);
2352 if(!dev)
2353 return -ENODEV;
2354 rt=NULL;
2355 }2356 else2357 {2358 #endif2359 /*2360 * Perform the IP routing decisions2361 */2362
2363 if(sk->localroute || flags&MSG_DONTROUTE)
2364 local=1;
2365
2366 rt = sk->ip_route_cache;
2367
2368 /*2369 * See if the routing cache is outdated. We need to clean this up once we are happy it is reliable2370 * by doing the invalidation actively in the route change and header change.2371 */2372
2373 saddr=sk->ip_route_saddr;
2374 if(!rt || sk->ip_route_stamp != rt_stamp || daddr!=sk->ip_route_daddr || sk->ip_route_local!=local || sk->saddr!=sk->ip_route_saddr)
2375 {2376 if(local)
2377 rt = ip_rt_local(daddr, NULL, &saddr);
2378 else2379 rt = ip_rt_route(daddr, NULL, &saddr);
2380 sk->ip_route_local=local;
2381 sk->ip_route_daddr=daddr;
2382 sk->ip_route_saddr=saddr;
2383 sk->ip_route_stamp=rt_stamp;
2384 sk->ip_route_cache=rt;
2385 sk->ip_hcache_ver=NULL;
2386 sk->ip_hcache_state= 0;
2387 }2388 elseif(rt)
2389 {2390 /*2391 * Attempt header caches only if the cached route is being reused. Header cache2392 * is not ultra cheap to set up. This means we only set it up on the second packet,2393 * so one shot communications are not slowed. We assume (seems reasonable) that 2 is2394 * probably going to be a stream of data.2395 */2396 if(rt->rt_dev->header_cache && sk->ip_hcache_state!= -1)
2397 {2398 if(sk->ip_hcache_ver==NULL || sk->ip_hcache_stamp!=*sk->ip_hcache_ver)
2399 rt->rt_dev->header_cache(rt->rt_dev,sk,saddr,daddr);
2400 else2401 /* Can't cache. Remember this */2402 sk->ip_hcache_state= -1;
2403 }2404 }2405
2406 if (rt == NULL)
2407 {2408 ip_statistics.IpOutNoRoutes++;
2409 return(-ENETUNREACH);
2410 }2411
2412 if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr)))
2413 saddr = sk->saddr;
2414
2415 dev=rt->rt_dev;
2416 #ifdefCONFIG_INET_MULTICAST2417 }2418 #endif2419
2420 /*2421 * Now compute the buffer space we require2422 */2423
2424 fragheaderlen = dev->hard_header_len;
2425 if(type != IPPROTO_RAW)
2426 fragheaderlen += 20;
2427
2428 /*2429 * Fragheaderlen is the size of 'overhead' on each buffer. Now work2430 * out the size of the frames to send.2431 */2432
2433 maxfraglen = ((dev->mtu-20) & ~7) + fragheaderlen;
2434
2435 /*2436 * Start at the end of the frame by handling the remainder.2437 */2438
2439 offset = length - (length % (maxfraglen - fragheaderlen));
2440
2441 /*2442 * Amount of memory to allocate for final fragment.2443 */2444
2445 fraglen = length - offset + fragheaderlen;
2446
2447 if(fraglen==0)
2448 {2449 fraglen = maxfraglen;
2450 offset -= maxfraglen-fragheaderlen;
2451 }2452
2453
2454 /*2455 * The last fragment will not have MF (more fragments) set.2456 */2457
2458 mf = 0;
2459
2460 /*2461 * Can't fragment raw packets 2462 */2463
2464 if (type == IPPROTO_RAW && offset > 0)
2465 return(-EMSGSIZE);
2466
2467 /*2468 * Get an identifier2469 */2470
2471 id = htons(ip_id_count++);
2472
2473 /*2474 * Being outputting the bytes.2475 */2476
2477 do2478 {2479 structsk_buff * skb;
2480 interror;
2481 char *data;
2482
2483 /*2484 * Get the memory we require with some space left for alignment.2485 */2486
2487 skb = sock_alloc_send_skb(sk, fraglen+15, 0, &error);
2488 if (skb == NULL)
2489 return(error);
2490
2491 /*2492 * Fill in the control structures2493 */2494
2495 skb->next = skb->prev = NULL;
2496 skb->dev = dev;
2497 skb->when = jiffies;
2498 skb->free = 1; /* dubious, this one */2499 skb->sk = sk;
2500 skb->arp = 0;
2501 skb->saddr = saddr;
2502 skb->raddr = (rt&&rt->rt_gateway) ? rt->rt_gateway : daddr;
2503 skb_reserve(skb,(dev->hard_header_len+15)&~15);
2504 data = skb_put(skb, fraglen-dev->hard_header_len);
2505
2506 /*2507 * Save us ARP and stuff. In the optimal case we do no route lookup (route cache ok)2508 * no ARP lookup (arp cache ok) and output. The cache checks are still too slow but2509 * this can be fixed later. For gateway routes we ought to have a rt->.. header cache2510 * pointer to speed header cache builds for identical targets.2511 */2512
2513 if(sk->ip_hcache_state>0)
2514 {2515 memcpy(skb->data,sk->ip_hcache_data, dev->hard_header_len);
2516 skb->arp=1;
2517 }2518 elseif (dev->hard_header)
2519 {2520 if(dev->hard_header(skb, dev, ETH_P_IP,
2521 NULL, NULL, 0)>0)
2522 skb->arp=1;
2523 }2524
2525 /*2526 * Find where to start putting bytes.2527 */2528
2529 iph = (structiphdr *)data;
2530
2531 /*2532 * Only write IP header onto non-raw packets 2533 */2534
2535 if(type != IPPROTO_RAW)
2536 {2537
2538 iph->version = 4;
2539 iph->ihl = 5; /* ugh */2540 iph->tos = sk->ip_tos;
2541 iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
2542 iph->id = id;
2543 iph->frag_off = htons(offset>>3);
2544 iph->frag_off |= mf;
2545 #ifdefCONFIG_IP_MULTICAST2546 if (MULTICAST(daddr))
2547 iph->ttl = sk->ip_mc_ttl;
2548 else2549 #endif2550 iph->ttl = sk->ip_ttl;
2551 iph->protocol = type;
2552 iph->check = 0;
2553 iph->saddr = saddr;
2554 iph->daddr = daddr;
2555 iph->check = ip_fast_csum((unsignedchar *)iph, iph->ihl);
2556 data += iph->ihl*4;
2557
2558 /*2559 * Any further fragments will have MF set.2560 */2561
2562 mf = htons(IP_MF);
2563 }2564
2565 /*2566 * User data callback2567 */2568
2569 getfrag(frag, saddr, data, offset, fraglen-fragheaderlen);
2570
2571 /*2572 * Account for the fragment.2573 */2574
2575 #ifdefCONFIG_IP_ACCT2576 if(!offset)
2577 ip_fw_chk(iph, dev, ip_acct_chain, IP_FW_F_ACCEPT, 1);
2578 #endif2579 offset -= (maxfraglen-fragheaderlen);
2580 fraglen = maxfraglen;
2581
2582 #ifdefCONFIG_IP_MULTICAST2583
2584 /*2585 * Multicasts are looped back for other local users2586 */2587
2588 if (MULTICAST(daddr) && !(dev->flags&IFF_LOOPBACK))
2589 {2590 /*2591 * Loop back any frames. The check for IGMP_ALL_HOSTS is because2592 * you are always magically a member of this group.2593 */2594
2595 if(sk==NULL || sk->ip_mc_loop)
2596 {2597 if(skb->daddr==IGMP_ALL_HOSTS)
2598 ip_loopback(rt->rt_dev,skb);
2599 else2600 {2601 structip_mc_list *imc=rt->rt_dev->ip_mc_list;
2602 while(imc!=NULL)
2603 {2604 if(imc->multiaddr==daddr)
2605 {2606 ip_loopback(rt->rt_dev,skb);
2607 break;
2608 }2609 imc=imc->next;
2610 }2611 }2612 }2613
2614 /*2615 * Multicasts with ttl 0 must not go beyond the host. Fixme: avoid the2616 * extra clone.2617 */2618
2619 if(skb->ip_hdr->ttl==0)
2620 kfree_skb(skb, FREE_READ);
2621 }2622 #endif2623 /*2624 * Now queue the bytes into the device.2625 */2626
2627 if (dev->flags & IFF_UP)
2628 {2629 dev_queue_xmit(skb, dev, sk->priority);
2630 }2631 else2632 {2633 /*2634 * Whoops... 2635 *2636 * FIXME: There is a small nasty here. During the ip_build_xmit we could2637 * page fault between the route lookup and device send, the device might be2638 * removed and unloaded.... We need to add device locks on this.2639 */2640
2641 ip_statistics.IpOutDiscards++;
2642 kfree_skb(skb, FREE_WRITE);
2643 return(0); /* lose rest of fragments */2644 }2645 }2646 while (offset >= 0);
2647
2648 return(0);
2649 }2650
2651
2652 /*2653 * IP protocol layer initialiser2654 */2655
/*
 * Link-layer registration record for IP.  The protocol type field is
 * left 0 here and filled in at run time by ip_init() with
 * htons(ETH_P_IP) -- it cannot be a compile-time constant on all
 * compilers, hence the "MUTTER" below.
 */
static struct packet_type ip_packet_type =
{
	0,		/* MUTTER ntohs(ETH_P_IP),*/
	NULL,		/* All devices */
	ip_rcv,		/* receive handler for ETH_P_IP frames */
	NULL,		/* NOTE(review): presumably private data -- confirm against struct packet_type */
	NULL,		/* NOTE(review): presumably list link -- confirm against struct packet_type */
};
2664
2665 /*2666 * Device notifier2667 */2668
2669 staticintip_rt_event(unsignedlongevent, void *ptr)
/* */2670 {2671 if(event==NETDEV_DOWN)
2672 ip_rt_flush(ptr);
2673 returnNOTIFY_DONE;
2674 }2675
/*
 * Notifier registered by ip_init() so ip_rt_event() is invoked on
 * device state changes (used to flush routes on NETDEV_DOWN).
 */
struct notifier_block ip_rt_notifier = {
	ip_rt_event,	/* callback */
	NULL,		/* NOTE(review): presumably chain link -- confirm against struct notifier_block */
	0		/* NOTE(review): presumably priority -- confirm against struct notifier_block */
};
2681
2682 /*2683 * IP registers the packet type and then calls the subprotocol initialisers2684 */2685
2686 voidip_init(void)
/* */2687 {2688 ip_packet_type.type=htons(ETH_P_IP);
2689 dev_add_pack(&ip_packet_type);
2690
2691 /* So we flush routes when a device is downed */2692 register_netdevice_notifier(&ip_rt_notifier);
2693 /* ip_raw_init();2694 ip_packet_init();2695 ip_tcp_init();2696 ip_udp_init();*/2697 }2698