1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * The Internet Protocol (IP) module. 7 * 8 * Version: @(#)ip.c 1.0.16b 9/1/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Donald Becker, <becker@super.org> 13 * Alan Cox, <gw4pts@gw4pts.ampr.org> 14 * Richard Underwood 15 * Stefan Becker, <stefanb@yello.ping.de> 16 * Jorge Cwik, <jorge@laser.satlink.net> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 18 * 19 * 20 * Fixes: 21 * Alan Cox : Commented a couple of minor bits of surplus code 22 * Alan Cox : Undefining IP_FORWARD doesn't include the code 23 * (just stops a compiler warning). 24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes 25 * are junked rather than corrupting things. 26 * Alan Cox : Frames to bad broadcast subnets are dumped 27 * We used to process them non broadcast and 28 * boy could that cause havoc. 29 * Alan Cox : ip_forward sets the free flag on the 30 * new frame it queues. Still crap because 31 * it copies the frame but at least it 32 * doesn't eat memory too. 33 * Alan Cox : Generic queue code and memory fixes. 34 * Fred Van Kempen : IP fragment support (borrowed from NET2E) 35 * Gerhard Koerting: Forward fragmented frames correctly. 36 * Gerhard Koerting: Fixes to my fix of the above 8-). 37 * Gerhard Koerting: IP interface addressing fix. 38 * Linus Torvalds : More robustness checks 39 * Alan Cox : Even more checks: Still not as robust as it ought to be 40 * Alan Cox : Save IP header pointer for later 41 * Alan Cox : ip option setting 42 * Alan Cox : Use ip_tos/ip_ttl settings 43 * Alan Cox : Fragmentation bogosity removed 44 * (Thanks to Mark.Bush@prg.ox.ac.uk) 45 * Dmitry Gorodchanin : Send of a raw packet crash fix. 
46 * Alan Cox : Silly ip bug when an overlength 47 * fragment turns up. Now frees the 48 * queue. 49 * Linus Torvalds/ : Memory leakage on fragmentation 50 * Alan Cox : handling. 51 * Gerhard Koerting: Forwarding uses IP priority hints 52 * Teemu Rantanen : Fragment problems. 53 * Alan Cox : General cleanup, comments and reformat 54 * Alan Cox : SNMP statistics 55 * Alan Cox : BSD address rule semantics. Also see 56 * UDP as there is a nasty checksum issue 57 * if you do things the wrong way. 58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file 59 * Alan Cox : IP options adjust sk->priority. 60 * Pedro Roque : Fix mtu/length error in ip_forward. 61 * Alan Cox : Avoid ip_chk_addr when possible. 62 * Richard Underwood : IP multicasting. 63 * Alan Cox : Cleaned up multicast handlers. 64 * Alan Cox : RAW sockets demultiplex in the BSD style. 65 * Gunther Mayer : Fix the SNMP reporting typo 66 * Alan Cox : Always in group 224.0.0.1 67 * Pauline Middelink : Fast ip_checksum update when forwarding 68 * Masquerading support. 69 * Alan Cox : Multicast loopback error for 224.0.0.1 70 * Alan Cox : IP_MULTICAST_LOOP option. 71 * Alan Cox : Use notifiers. 72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too) 73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) 74 * Stefan Becker : Send out ICMP HOST REDIRECT 75 * Arnt Gulbrandsen : ip_build_xmit 76 * Alan Cox : Per socket routing cache 77 * Alan Cox : Fixed routing cache, added header cache. 78 * Alan Cox : Loopback didnt work right in original ip_build_xmit - fixed it. 79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. 80 * Alan Cox : Incoming IP option handling. 81 * Alan Cox : Set saddr on raw output frames as per BSD. 82 * Alan Cox : Stopped broadcast source route explosions. 83 * Alan Cox : Can disable source routing 84 * 85 * 86 * 87 * To Fix: 88 * IP option processing is mostly not needed. ip_forward needs to know about routing rules 89 * and time stamp but that's about all. 
Use the route mtu field here too 90 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient 91 * and could be made very efficient with the addition of some virtual memory hacks to permit 92 * the allocation of a buffer that can then be 'grown' by twiddling page tables. 93 * Output fragmentation wants updating along with the buffer management to use a single 94 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet 95 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause 96 * fragmentation anyway. 97 * 98 * FIXME: copy frag 0 iph to qp->iph 99 * 100 * This program is free software; you can redistribute it and/or 101 * modify it under the terms of the GNU General Public License 102 * as published by the Free Software Foundation; either version 103 * 2 of the License, or (at your option) any later version. 104 */ 105
106 #include <asm/segment.h>
107 #include <asm/system.h>
108 #include <linux/types.h>
109 #include <linux/kernel.h>
110 #include <linux/sched.h>
111 #include <linux/mm.h>
112 #include <linux/string.h>
113 #include <linux/errno.h>
114 #include <linux/config.h>
115
116 #include <linux/socket.h>
117 #include <linux/sockios.h>
118 #include <linux/in.h>
119 #include <linux/inet.h>
120 #include <linux/netdevice.h>
121 #include <linux/etherdevice.h>
122
123 #include <net/snmp.h>
124 #include <net/ip.h>
125 #include <net/protocol.h>
126 #include <net/route.h>
127 #include <net/tcp.h>
128 #include <net/udp.h>
129 #include <linux/skbuff.h>
130 #include <net/sock.h>
131 #include <net/arp.h>
132 #include <net/icmp.h>
133 #include <net/raw.h>
134 #include <net/checksum.h>
135 #include <linux/igmp.h>
136 #include <linux/ip_fw.h>
137
/*
 *	Local tunables and small helpers shared by the routines below.
 */

#define CONFIG_IP_DEFRAG

extern int last_retran;
extern void sort_send(struct sock *sk);

/* Classic two-argument minimum. Beware: evaluates its winner twice. */
#define min(a,b)	((a)<(b)?(a):(b))

/* True when x (an address in network byte order) lies in 127.0.0.0/8. */
#define LOOPBACK(x)	(((x) & htonl(0xff000000)) == htonl(0x7f000000))
/*
 *	SNMP management statistics.
 *
 *	The initializer sets the first two ip_mib members: the SNMP
 *	ipForwarding flag (1 = act as a router, 0 = host only) and the
 *	default TTL stamped on outgoing datagrams.
 */

#ifdef CONFIG_IP_FORWARD
struct ip_mib ip_statistics = {1, 64,};	/* Forwarding=Yes, Default TTL=64 */
#else
struct ip_mib ip_statistics = {0, 64,};	/* Forwarding=No, Default TTL=64 */
#endif
/*
 *	Handle the issuing of an ioctl() request for the ip device.
 *	This is scheduled to disappear.
 *
 *	No IP-level ioctl commands are implemented, so every request
 *	is rejected with -EINVAL.
 */

int ip_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	return -EINVAL;
}
171
172 /* 173 * Take an skb, and fill in the MAC header. 174 */ 175
176 staticintip_send(structsk_buff *skb, unsignedlongdaddr, intlen, structdevice *dev, unsignedlongsaddr)
/* */ 177 { 178 intmac = 0;
179
180 skb->dev = dev;
181 skb->arp = 1;
182 if (dev->hard_header)
183 { 184 /* 185 * Build a hardware header. Source address is our mac, destination unknown 186 * (rebuild header will sort this out) 187 */ 188 skb_reserve(skb,(dev->hard_header_len+15)&~15); /* 16 byte aligned IP headers are good */ 189 mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len);
190 if (mac < 0)
191 { 192 mac = -mac;
193 skb->arp = 0;
194 skb->raddr = daddr; /* next routing address */ 195 } 196 } 197 returnmac;
198 } 199
/* Running counter used to stamp the identification field of outgoing IP datagrams. */
int ip_id_count = 0;
201
202 /* 203 * This routine builds the appropriate hardware/IP headers for 204 * the routine. It assumes that if *dev != NULL then the 205 * protocol knows what it's doing, otherwise it uses the 206 * routing/ARP tables to select a device struct. 207 */ 208 intip_build_header(structsk_buff *skb, unsignedlongsaddr, unsignedlongdaddr,
/* */ 209 structdevice **dev, inttype, structoptions *opt, intlen, inttos, intttl)
210 { 211 structrtable *rt;
212 unsignedlongraddr;
213 inttmp;
214 unsignedlongsrc;
215 structiphdr *iph;
216
217 /* 218 * See if we need to look up the device. 219 */ 220
221 #ifdefCONFIG_INET_MULTICAST 222 if(MULTICAST(daddr) && *dev==NULL && skb->sk && *skb->sk->ip_mc_name)
223 *dev=dev_get(skb->sk->ip_mc_name);
224 #endif 225 if (*dev == NULL)
226 { 227 if(skb->localroute)
228 rt = ip_rt_local(daddr, NULL, &src);
229 else 230 rt = ip_rt_route(daddr, NULL, &src);
231 if (rt == NULL)
232 { 233 ip_statistics.IpOutNoRoutes++;
234 return(-ENETUNREACH);
235 } 236
237 *dev = rt->rt_dev;
238 /* 239 * If the frame is from us and going off machine it MUST MUST MUST 240 * have the output device ip address and never the loopback 241 */ 242 if (LOOPBACK(saddr) && !LOOPBACK(daddr))
243 saddr = src;/*rt->rt_dev->pa_addr;*/ 244 raddr = rt->rt_gateway;
245
246 } 247 else 248 { 249 /* 250 * We still need the address of the first hop. 251 */ 252 if(skb->localroute)
253 rt = ip_rt_local(daddr, NULL, &src);
254 else 255 rt = ip_rt_route(daddr, NULL, &src);
256 /* 257 * If the frame is from us and going off machine it MUST MUST MUST 258 * have the output device ip address and never the loopback 259 */ 260 if (LOOPBACK(saddr) && !LOOPBACK(daddr))
261 saddr = src;/*rt->rt_dev->pa_addr;*/ 262
263 raddr = (rt == NULL) ? 0 : rt->rt_gateway;
264 } 265
266 /* 267 * No source addr so make it our addr 268 */ 269 if (saddr == 0)
270 saddr = src;
271
272 /* 273 * No gateway so aim at the real destination 274 */ 275 if (raddr == 0)
276 raddr = daddr;
277
278 /* 279 * Now build the MAC header. 280 */ 281
282 tmp = ip_send(skb, raddr, len, *dev, saddr);
283
284 /* 285 * Book keeping 286 */ 287
288 skb->dev = *dev;
289 skb->saddr = saddr;
290 if (skb->sk)
291 skb->sk->saddr = saddr;
292
293 /* 294 * Now build the IP header. 295 */ 296
297 /* 298 * If we are using IPPROTO_RAW, then we don't need an IP header, since 299 * one is being supplied to us by the user 300 */ 301
302 if(type == IPPROTO_RAW)
303 return (tmp);
304
305 /* 306 * Build the IP addresses 307 */ 308
309 iph=(structiphdr *)skb_put(skb,sizeof(structiphdr));
310
311 iph->version = 4;
312 iph->ihl = 5;
313 iph->tos = tos;
314 iph->frag_off = 0;
315 iph->ttl = ttl;
316 iph->daddr = daddr;
317 iph->saddr = saddr;
318 iph->protocol = type;
319 skb->ip_hdr = iph;
320
321 return(20 + tmp); /* IP header plus MAC header size */ 322 } 323
324
/*
 *	Generate a checksum for an outgoing IP datagram.
 */

void ip_send_check(struct iphdr *iph)
{
	/* The checksum field must be zero while the header sum is computed. */
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
335 /************************ Fragment Handlers From NET2E **********************************/ 336
337
/*
 *	This fragment handler is a bit of a heap. On the other hand it works quite
 *	happily and handles things quite well.
 */

/*
 * Head of the doubly-linked list of partially reassembled datagrams.
 * Accessed from both process and interrupt context; all walkers bracket
 * their access with cli()/sti().
 */
static struct ipq *ipqueue = NULL;		/* IP fragment queue	*/
345 /* 346 * Create a new fragment entry. 347 */ 348
349 staticstructipfrag *ip_frag_create(intoffset, intend, structsk_buff *skb, unsignedchar *ptr)
/* */ 350 { 351 structipfrag *fp;
352
353 fp = (structipfrag *) kmalloc(sizeof(structipfrag), GFP_ATOMIC);
354 if (fp == NULL)
355 { 356 NETDEBUG(printk("IP: frag_create: no memory left !\n"));
357 return(NULL);
358 } 359 memset(fp, 0, sizeof(structipfrag));
360
361 /* Fill in the structure. */ 362 fp->offset = offset;
363 fp->end = end;
364 fp->len = end - offset;
365 fp->skb = skb;
366 fp->ptr = ptr;
367
368 return(fp);
369 } 370
371
372 /* 373 * Find the correct entry in the "incomplete datagrams" queue for 374 * this IP datagram, and return the queue entry address if found. 375 */ 376
377 staticstructipq *ip_find(structiphdr *iph)
/* */ 378 { 379 structipq *qp;
380 structipq *qplast;
381
382 cli();
383 qplast = NULL;
384 for(qp = ipqueue; qp != NULL; qplast = qp, qp = qp->next)
385 { 386 if (iph->id== qp->iph->id && iph->saddr == qp->iph->saddr &&
387 iph->daddr == qp->iph->daddr && iph->protocol == qp->iph->protocol)
388 { 389 del_timer(&qp->timer); /* So it doesn't vanish on us. The timer will be reset anyway */ 390 sti();
391 return(qp);
392 } 393 } 394 sti();
395 return(NULL);
396 } 397
398
/*
 *	Remove an entry from the "incomplete datagrams" queue, either
 *	because we completed, reassembled and processed it, or because
 *	it timed out.
 *
 *	Frees every queued fragment skb, the saved header copy and the
 *	descriptor itself; qp must not be touched after this returns.
 */

static void ip_free(struct ipq *qp)
{
	struct ipfrag *fp;
	struct ipfrag *xp;

	/*
	 * Stop the timer for this entry.  Must happen before the unlink
	 * so ip_expire cannot fire against a half-removed entry.
	 */

	del_timer(&qp->timer);

	/* Remove this entry from the "incomplete datagrams" queue. */
	cli();		/* the list is also walked from interrupt context */
	if (qp->prev == NULL)
	{
		/* We were the list head. */
		ipqueue = qp->next;
		if (ipqueue != NULL)
			ipqueue->prev = NULL;
	}
	else
	{
		qp->prev->next = qp->next;
		if (qp->next != NULL)
			qp->next->prev = qp->prev;
	}

	/* Release all fragment data. */

	fp = qp->fragments;
	while (fp != NULL)
	{
		xp = fp->next;	/* save the link before fp is freed */
		IS_SKB(fp->skb);
		kfree_skb(fp->skb,FREE_READ);
		kfree_s(fp, sizeof(struct ipfrag));
		fp = xp;
	}

	/* Release the IP header. (64 + 8 matches the kmalloc in ip_create.) */
	kfree_s(qp->iph, 64 + 8);

	/* Finally, release the queue descriptor itself. */
	kfree_s(qp, sizeof(struct ipq));
	sti();
}
451
452 /* 453 * Oops- a fragment queue timed out. Kill it and send an ICMP reply. 454 */ 455
456 staticvoidip_expire(unsignedlongarg)
/* */ 457 { 458 structipq *qp;
459
460 qp = (structipq *)arg;
461
462 /* 463 * Send an ICMP "Fragment Reassembly Timeout" message. 464 */ 465
466 ip_statistics.IpReasmTimeout++;
467 ip_statistics.IpReasmFails++;
468 /* This if is always true... shrug */ 469 if(qp->fragments!=NULL)
470 icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED,
471 ICMP_EXC_FRAGTIME, 0, qp->dev);
472
473 /* 474 * Nuke the fragment queue. 475 */ 476 ip_free(qp);
477 } 478
479
480 /* 481 * Add an entry to the 'ipq' queue for a newly received IP datagram. 482 * We will (hopefully :-) receive all other fragments of this datagram 483 * in time, so we just create a queue for this datagram, in which we 484 * will insert the received fragments at their respective positions. 485 */ 486
487 staticstructipq *ip_create(structsk_buff *skb, structiphdr *iph, structdevice *dev)
/* */ 488 { 489 structipq *qp;
490 intihlen;
491
492 qp = (structipq *) kmalloc(sizeof(structipq), GFP_ATOMIC);
493 if (qp == NULL)
494 { 495 NETDEBUG(printk("IP: create: no memory left !\n"));
496 return(NULL);
497 skb->dev = qp->dev;
498 } 499 memset(qp, 0, sizeof(structipq));
500
501 /* 502 * Allocate memory for the IP header (plus 8 octets for ICMP). 503 */ 504
505 ihlen = iph->ihl * 4;
506 qp->iph = (structiphdr *) kmalloc(64 + 8, GFP_ATOMIC);
507 if (qp->iph == NULL)
508 { 509 NETDEBUG(printk("IP: create: no memory left !\n"));
510 kfree_s(qp, sizeof(structipq));
511 return(NULL);
512 } 513
514 memcpy(qp->iph, iph, ihlen + 8);
515 qp->len = 0;
516 qp->ihlen = ihlen;
517 qp->fragments = NULL;
518 qp->dev = dev;
519
520 /* Start a timer for this entry. */ 521 qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ 522 qp->timer.data = (unsignedlong) qp; /* pointer to queue */ 523 qp->timer.function = ip_expire; /* expire function */ 524 add_timer(&qp->timer);
525
526 /* Add this entry to the queue. */ 527 qp->prev = NULL;
528 cli();
529 qp->next = ipqueue;
530 if (qp->next != NULL)
531 qp->next->prev = qp;
532 ipqueue = qp;
533 sti();
534 return(qp);
535 } 536
537
538 /* 539 * See if a fragment queue is complete. 540 */ 541
542 staticintip_done(structipq *qp)
/* */ 543 { 544 structipfrag *fp;
545 intoffset;
546
547 /* Only possible if we received the final fragment. */ 548 if (qp->len == 0)
549 return(0);
550
551 /* Check all fragment offsets to see if they connect. */ 552 fp = qp->fragments;
553 offset = 0;
554 while (fp != NULL)
555 { 556 if (fp->offset > offset)
557 return(0); /* fragment(s) missing */ 558 offset = fp->end;
559 fp = fp->next;
560 } 561
562 /* All fragments are present. */ 563 return(1);
564 } 565
566
/*
 *	Build a new IP datagram from all its fragments.
 *
 *	Consumes the queue entry (ip_free) on both success and failure.
 *	Returns the reassembled skb, or NULL on allocation failure or a
 *	corrupt fragment list.
 *
 *	FIXME: We copy here because we lack an effective way of handling lists
 *	of bits on input. Until the new skb data handling is in I'm not going
 *	to touch this with a bargepole.
 *
 *	NOTE(review): per the file-header FIXME, the header used here is the
 *	one saved when the queue was created (first fragment to *arrive*),
 *	which is presumably not always fragment 0's header — confirm.
 */

static struct sk_buff *ip_glue(struct ipq *qp)
{
	struct sk_buff *skb;
	struct iphdr *iph;
	struct ipfrag *fp;
	unsigned char *ptr;
	int count, len;

	/*
	 * Allocate a new buffer for the datagram.
	 * len = saved header length plus total data length.
	 */
	len = qp->ihlen + qp->len;

	if ((skb = dev_alloc_skb(len)) == NULL)
	{
		ip_statistics.IpReasmFails++;
		NETDEBUG(printk("IP: queue_glue: no memory for gluing queue %p\n", qp));
		ip_free(qp);
		return(NULL);
	}

	/* Fill in the basic details. */
	skb_put(skb,len);
	skb->h.raw = skb->data;
	skb->free = 1;

	/* Copy the original IP headers into the new buffer. */
	ptr = (unsigned char *) skb->h.raw;
	memcpy(ptr, ((unsigned char *) qp->iph), qp->ihlen);
	ptr += qp->ihlen;	/* data is written after the header */

	count = 0;		/* running total of data bytes copied */

	/* Copy the data portions of all fragments into the new buffer. */
	fp = qp->fragments;
	while(fp != NULL)
	{
		/* Sanity: a fragment must not write past the allocated buffer. */
		if(count+fp->len > skb->len)
		{
			NETDEBUG(printk("Invalid fragment list: Fragment over size.\n"));
			ip_free(qp);
			kfree_skb(skb,FREE_WRITE);
			ip_statistics.IpReasmFails++;
			return NULL;
		}
		memcpy((ptr + fp->offset), fp->ptr, fp->len);
		count += fp->len;
		fp = fp->next;
	}

	/* We glued together all fragments, so remove the queue entry. */
	ip_free(qp);

	/* Done with all fragments. Fixup the new IP header. */
	iph = skb->h.iph;
	iph->frag_off = 0;				/* no longer a fragment */
	iph->tot_len = htons((iph->ihl * 4) + count);	/* true reassembled length */
	skb->ip_hdr = iph;

	ip_statistics.IpReasmOKs++;
	return(skb);
}
638
639 /* 640 * Process an incoming IP datagram fragment. 641 */ 642
643 staticstructsk_buff *ip_defrag(structiphdr *iph, structsk_buff *skb, structdevice *dev)
/* */ 644 { 645 structipfrag *prev, *next, *tmp;
646 structipfrag *tfp;
647 structipq *qp;
648 structsk_buff *skb2;
649 unsignedchar *ptr;
650 intflags, offset;
651 inti, ihl, end;
652
653 ip_statistics.IpReasmReqds++;
654
655 /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */ 656 qp = ip_find(iph);
657
658 /* Is this a non-fragmented datagram? */ 659 offset = ntohs(iph->frag_off);
660 flags = offset & ~IP_OFFSET;
661 offset &= IP_OFFSET;
662 if (((flags & IP_MF) == 0) && (offset == 0))
663 { 664 if (qp != NULL)
665 ip_free(qp); /* Huh? How could this exist?? */ 666 return(skb);
667 } 668
669 offset <<= 3; /* offset is in 8-byte chunks */ 670
671 /* 672 * If the queue already existed, keep restarting its timer as long 673 * as we still are receiving fragments. Otherwise, create a fresh 674 * queue entry. 675 */ 676
677 if (qp != NULL)
678 { 679 del_timer(&qp->timer);
680 qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ 681 qp->timer.data = (unsignedlong) qp; /* pointer to queue */ 682 qp->timer.function = ip_expire; /* expire function */ 683 add_timer(&qp->timer);
684 } 685 else 686 { 687 /* 688 * If we failed to create it, then discard the frame 689 */ 690 if ((qp = ip_create(skb, iph, dev)) == NULL)
691 { 692 skb->sk = NULL;
693 kfree_skb(skb, FREE_READ);
694 ip_statistics.IpReasmFails++;
695 returnNULL;
696 } 697 } 698
699 /* 700 * Determine the position of this fragment. 701 */ 702
703 ihl = iph->ihl * 4;
704 end = offset + ntohs(iph->tot_len) - ihl;
705
706 /* 707 * Point into the IP datagram 'data' part. 708 */ 709
710 ptr = skb->data + ihl;
711
712 /* 713 * Is this the final fragment? 714 */ 715
716 if ((flags & IP_MF) == 0)
717 qp->len = end;
718
719 /* 720 * Find out which fragments are in front and at the back of us 721 * in the chain of fragments so far. We must know where to put 722 * this fragment, right? 723 */ 724
725 prev = NULL;
726 for(next = qp->fragments; next != NULL; next = next->next)
727 { 728 if (next->offset > offset)
729 break; /* bingo! */ 730 prev = next;
731 } 732
733 /* 734 * We found where to put this one. 735 * Check for overlap with preceding fragment, and, if needed, 736 * align things so that any overlaps are eliminated. 737 */ 738 if (prev != NULL && offset < prev->end)
739 { 740 i = prev->end - offset;
741 offset += i; /* ptr into datagram */ 742 ptr += i; /* ptr into fragment data */ 743 } 744
745 /* 746 * Look for overlap with succeeding segments. 747 * If we can merge fragments, do it. 748 */ 749
750 for(tmp=next; tmp != NULL; tmp = tfp)
751 { 752 tfp = tmp->next;
753 if (tmp->offset >= end)
754 break; /* no overlaps at all */ 755
756 i = end - next->offset; /* overlap is 'i' bytes */ 757 tmp->len -= i; /* so reduce size of */ 758 tmp->offset += i; /* next fragment */ 759 tmp->ptr += i;
760 /* 761 * If we get a frag size of <= 0, remove it and the packet 762 * that it goes with. 763 */ 764 if (tmp->len <= 0)
765 { 766 if (tmp->prev != NULL)
767 tmp->prev->next = tmp->next;
768 else 769 qp->fragments = tmp->next;
770
771 if (tfp->next != NULL)
772 tmp->next->prev = tmp->prev;
773
774 next=tfp; /* We have killed the original next frame */ 775
776 kfree_skb(tmp->skb,FREE_READ);
777 kfree_s(tmp, sizeof(structipfrag));
778 } 779 } 780
781 /* 782 * Insert this fragment in the chain of fragments. 783 */ 784
785 tfp = NULL;
786 tfp = ip_frag_create(offset, end, skb, ptr);
787
788 /* 789 * No memory to save the fragment - so throw the lot 790 */ 791
792 if (!tfp)
793 { 794 skb->sk = NULL;
795 kfree_skb(skb, FREE_READ);
796 returnNULL;
797 } 798 tfp->prev = prev;
799 tfp->next = next;
800 if (prev != NULL)
801 prev->next = tfp;
802 else 803 qp->fragments = tfp;
804
805 if (next != NULL)
806 next->prev = tfp;
807
808 /* 809 * OK, so we inserted this new fragment into the chain. 810 * Check if we now have a full IP datagram which we can 811 * bump up to the IP layer... 812 */ 813
814 if (ip_done(qp))
815 { 816 skb2 = ip_glue(qp); /* glue together the fragments */ 817 return(skb2);
818 } 819 return(NULL);
820 } 821
822
/*
 *	This IP datagram is too large to be sent in one piece. Break it up into
 *	smaller pieces (each of size equal to the MAC header plus IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending by calling the
 *	ip_queue_xmit(). Note that this is recursion, and bad things will happen
 *	if this function causes a loop...
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 *
 *	**Protocol Violation**
 *	We copy all the options to each fragment. !FIXME!
 *
 *	NOTE(review): each fragment's tot_len and checksum are not rewritten
 *	here — presumably ip_queue_xmit() fixes them before transmission;
 *	confirm.  skb->data is assumed to start at the MAC header.
 */

void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag)
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct sk_buff *skb2;
	int left, mtu, hlen, len;
	int offset;
	unsigned long flags;

	/*
	 * Point into the IP datagram header.
	 */

	raw = skb->data;
	iph = (struct iphdr *) (raw + dev->hard_header_len);

	skb->ip_hdr = iph;

	/*
	 * Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	hlen += dev->hard_header_len;		/* Total header size */
	mtu = (dev->mtu - hlen);		/* Size of data space */
	ptr = (raw + hlen);			/* Where to start from */

	/*
	 * Check for any "DF" flag. [DF means do not fragment]
	 */

	if (ntohs(iph->frag_off) & IP_DF)
	{
		/*
		 * Reply giving the MTU of the failed hop.
		 */
		ip_statistics.IpFragFails++;
		icmp_send(skb,ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, dev->mtu, dev);
		return;
	}

	/*
	 * The protocol doesn't seem to say what to do in the case that the
	 * frame + options doesn't fit the mtu. As it used to fall down dead
	 * in this case we were fortunate it didn't happen
	 * (8 is the minimum payload: fragment offsets are 8-byte units).
	 */

	if(mtu<8)
	{
		/* It's wrong but it's better than nothing */
		icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev);
		ip_statistics.IpFragFails++;
		return;
	}

	/*
	 * Fragment the datagram.
	 *
	 * The initial offset is 0 for a complete frame. When
	 * fragmenting fragments it's wherever this one starts.
	 */

	if (is_frag & 2)
		offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	else
		offset = 0;

	/*
	 * Keep copying data until we run out.
	 */

	while(left > 0)
	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
		{
			len/=8;
			len*=8;
		}

		/*
		 * Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen+15,GFP_ATOMIC)) == NULL)
		{
			NETDEBUG(printk("IP: frag: no memory for new fragment!\n"));
			ip_statistics.IpFragFails++;
			return;
		}

		/*
		 * Set up data on packet
		 */

		skb2->arp = skb->arp;
		if(skb->free==0)
			printk("IP fragmenter: BUG free!=1 in fragmenter\n");
		skb2->free = 1;
		skb_put(skb2,len + hlen);
		skb2->h.raw=(char *) skb2->data;

		/*
		 * Charge the memory for the fragment to any owner
		 * it might possess
		 */

		save_flags(flags);
		if (sk)
		{
			cli();
			sk->wmem_alloc += skb2->truesize;
			skb2->sk=sk;
		}
		restore_flags(flags);
		skb2->raddr = skb->raddr;	/* For rebuild_header - must be here */

		/*
		 * Copy the packet header into the new buffer.
		 */

		memcpy(skb2->h.raw, raw, hlen);

		/*
		 * Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw + hlen, ptr, len);
		left -= len;

		/* h.raw now points at the fragment's IP header, not its MAC header */
		skb2->h.raw+=dev->hard_header_len;

		/*
		 * Fill in the new header fields.
		 */
		iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/);
		iph->frag_off = htons((offset >> 3));	/* offset field counts 8-byte units */
		/*
		 * Added AC : If we are fragmenting a fragment thats not the
		 * last fragment then keep MF on each bit
		 */
		if (left > 0 || (is_frag & 1))
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 * Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		ip_queue_xmit(sk, dev, skb2, 2);
	}
	ip_statistics.IpFragOKs++;
}
1001
1002
1003 #ifdefCONFIG_IP_FORWARD1004
/*
 *	Forward an IP datagram to its next destination.
 *
 *	The skb is NOT consumed on most paths — the caller appears to own
 *	and free it (only the strict-source-route failure frees it here);
 *	a copy is made for transmission.
 *
 *	NOTE(review): iph->ttl is unsigned, so 'ttl <= 0' only catches an
 *	incoming TTL of 1; a TTL of 0 would wrap to 255 at the decrement.
 *	Presumably ip_rcv never hands a TTL-0 packet to us — confirm.
 */

void ip_forward(struct sk_buff *skb, struct device *dev, int is_frag, unsigned long target_addr, int target_strict)
{
	struct device *dev2;	/* Output device */
	struct iphdr *iph;	/* Our header */
	struct sk_buff *skb2;	/* Output packet */
	struct rtable *rt;	/* Route we use */
	unsigned char *ptr;	/* Data pointer */
	unsigned long raddr;	/* Router IP address */
#ifdef CONFIG_IP_FIREWALL
	int fw_res = 0;		/* Forwarding result */

	/*
	 * See if we are allowed to forward this.
	 * Note: demasqueraded fragments are always 'back'warded.
	 */

	if(!(is_frag&4) && (fw_res=ip_fw_chk(skb->h.iph, dev, ip_fw_fwd_chain, ip_fw_fwd_policy, 0))!=1)
	{
		if(fw_res==-1)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev);
		return;
	}
#endif
	/*
	 * According to the RFC, we must first decrease the TTL field. If
	 * that reaches zero, we must reply an ICMP control message telling
	 * that the packet's lifetime expired.
	 *
	 * Exception:
	 * We may not generate an ICMP for an ICMP. icmp_send does the
	 * enforcement of this so we can forget it here. It is however
	 * sometimes VERY important.
	 */

	iph = skb->h.iph;
	iph->ttl--;

	/*
	 * Re-compute the IP header checksum incrementally: decrementing the
	 * TTL adds 0x0100 to the one's-complement sum (with end-around carry).
	 * This is inefficient. We know what has happened to the header
	 * and could thus adjust the checksum as Phil Karn does in KA9Q
	 */

	iph->check = ntohs(iph->check) + 0x0100;
	if ((iph->check & 0xFF00) == 0)
		iph->check++;		/* carry overflow */
	iph->check = htons(iph->check);

	if (iph->ttl <= 0)
	{
		/* Tell the sender its packet died... */
		icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0, dev);
		return;
	}

	/*
	 * OK, the packet is still valid. Fetch its destination address,
	 * and give it to the IP sender for further processing.
	 */

	rt = ip_rt_route(target_addr, NULL, NULL);
	if (rt == NULL)
	{
		/*
		 * Tell the sender its packet cannot be delivered. Again
		 * ICMP is screened later.
		 */
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0, dev);
		return;
	}

	/*
	 * Gosh. Not only is the packet valid; we even know how to
	 * forward it onto its final destination. Can we say this
	 * is being plain lucky?
	 * If the router told us that there is no GW, use the dest.
	 * IP address itself- we seem to be connected directly...
	 */

	raddr = rt->rt_gateway;

	if (raddr != 0)
	{
		/*
		 * Strict routing permits no gatewaying
		 */

		if(target_strict)
		{
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0, dev);
			kfree_skb(skb, FREE_READ);
			return;
		}

		/*
		 * There is a gateway so find the correct route for it.
		 * Gateways cannot in turn be gatewayed.
		 */

		rt = ip_rt_route(raddr, NULL, NULL);
		if (rt == NULL)
		{
			/*
			 * Tell the sender its packet cannot be delivered...
			 */
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev);
			return;
		}
		if (rt->rt_gateway != 0)
			raddr = rt->rt_gateway;
	}
	else
		raddr = target_addr;

	/*
	 * Having picked a route we can now send the frame out.
	 */

	dev2 = rt->rt_dev;

	/*
	 * In IP you never have to forward a frame on the interface that it
	 * arrived upon. We now generate an ICMP HOST REDIRECT giving the route
	 * we calculated.
	 */
#ifndef CONFIG_IP_NO_ICMP_REDIRECT
	/* Redirect only when src and dest share the input interface's subnet. */
	if (dev == dev2 && !((iph->saddr^iph->daddr)&dev->pa_mask) && rt->rt_flags&RTF_MODIFIED)
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, raddr, dev);
#endif

	/*
	 * We now allocate a new buffer, and copy the datagram into it.
	 * If the indicated interface is up and running, kick it.
	 */

	if (dev2->flags & IFF_UP)
	{
#ifdef CONFIG_IP_MASQUERADE
		/*
		 * If this fragment needs masquerading, make it so...
		 * (Dont masquerade de-masqueraded fragments)
		 * NOTE(review): ip_fw_masquerade may replace skb; iph still
		 * points into the old buffer below — confirm intended.
		 */
		if (!(is_frag&4) && fw_res==2)
			ip_fw_masquerade(&skb, dev2);
#endif

		/*
		 * Current design decrees we copy the packet. For identical header
		 * lengths we could avoid it. The new skb code will let us push
		 * data so the problem goes away then.
		 */

		skb2 = alloc_skb(dev2->hard_header_len + skb->len + 15, GFP_ATOMIC);

		/*
		 * This is rare and since IP is tolerant of network failures
		 * quite harmless.
		 */

		if (skb2 == NULL)
		{
			NETDEBUG(printk("\nIP: No memory available for IP forward\n"));
			return;
		}

		/* Now build the MAC header. */
		(void) ip_send(skb2, raddr, skb->len, dev2, dev2->pa_addr);

		ptr = skb_put(skb2,skb->len);
		skb2->free = 1;
		skb2->h.raw = ptr;

		/*
		 * Copy the packet data into the new buffer.
		 */
		memcpy(ptr, skb->h.raw, skb->len);

		ip_statistics.IpForwDatagrams++;

		/*
		 * See if it needs fragmenting. Note in ip_rcv we tagged
		 * the fragment type. This must be right so that
		 * the fragmenter does the right thing.
		 */

		if(skb2->len > dev2->mtu + dev2->hard_header_len)
		{
			ip_fragment(NULL,skb2,dev2, is_frag);
			kfree_skb(skb2,FREE_WRITE);	/* fragments were copied again; free the intermediate copy */
		}
		else
		{
#ifdef CONFIG_IP_ACCT
			/*
			 * Count mapping we shortcut
			 */

			ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
#endif

			/*
			 * Map service types to priority. We lie about
			 * throughput being low priority, but it's a good
			 * choice to help improve general usage.
			 */
			if(iph->tos & IPTOS_LOWDELAY)
				dev_queue_xmit(skb2, dev2, SOPRI_INTERACTIVE);
			else if(iph->tos & IPTOS_THROUGHPUT)
				dev_queue_xmit(skb2, dev2, SOPRI_BACKGROUND);
			else
				dev_queue_xmit(skb2, dev2, SOPRI_NORMAL);
		}
	}
}
1228
1229 #endif1230
1231 /*1232 * This function receives all incoming IP datagrams.1233 *1234 * On entry skb->data points to the start of the IP header and1235 * the MAC header has been removed.1236 */1237
/*
 * Entry point for every received IP datagram.  On entry skb->data points
 * at the IP header (MAC header already stripped).  Validates the header,
 * runs the firewall, parses options, reassembles fragments, then either
 * delivers locally (raw sockets + one transport protocol handler) or
 * hands the frame to ip_forward().  Always returns 0 except on bad
 * options, and always consumes (frees or forwards) the skb.
 */
int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
	struct iphdr *iph = skb->h.iph;
	struct sock *raw_sk = NULL;
	unsigned char hash;
	unsigned char flag = 0;		/* set once any protocol handler takes the frame */
	struct inet_protocol *ipprot;
	int brd = IS_MYADDR;		/* address class of daddr; IS_MYADDR unless ip_chk_addr says otherwise */
	unsigned long target_addr;	/* where to forward to (may be rewritten by source routing) */
	int target_strict = 0;		/* 1 if SSRR demanded strict routing */
	int is_frag = 0;		/* bit 0 = more fragments, bit 1 = not first fragment */
#ifdef CONFIG_IP_FIREWALL
	int err;
#endif

	ip_statistics.IpInReceives++;

	/*
	 * Tag the ip header of this packet so we can find it
	 */
	skb->ip_hdr = iph;

	/*
	 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 * Is the datagram acceptable?
	 *	1. Length at least the size of an ip header
	 *	2. Version of 4
	 *	3. Checksums correctly
	 *	4. Doesn't have a bogus length (buffer shorter than claimed total)
	 */
	if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0
		|| skb->len < ntohs(iph->tot_len))
	{
		ip_statistics.IpInHdrErrors++;
		kfree_skb(skb, FREE_WRITE);
		return(0);
	}

	/*
	 * Our transport medium may have padded the buffer out.  Now we know it
	 * is IP we can trim to the true length of the frame.
	 */
	skb_trim(skb,ntohs(iph->tot_len));

	/*
	 * See if the firewall wants to dispose of the packet.
	 * err == -1 means "reject" (send an ICMP error); other <1 means silent drop.
	 */
#ifdef CONFIG_IP_FIREWALL
	if ((err=ip_fw_chk(iph,dev,ip_fw_blk_chain,ip_fw_blk_policy, 0))<1)
	{
		if(err==-1)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev);
		kfree_skb(skb, FREE_WRITE);
		return 0;
	}
#endif

	/*
	 * Next analyse the packet for options.  Studies show under one packet in
	 * a thousand have options....
	 */
	target_addr = iph->daddr;

	if (iph->ihl != 5)
	{
		/*
		 * Walk the option bytes following the 20-byte fixed header.
		 * RFC 1122: 3.2.1.8 MUST NOT crash on a zero length option.
		 */
		int opt_space=4*(iph->ihl-5);
		int opt_size;
		unsigned char *opt_ptr=skb->h.raw+sizeof(struct iphdr);

		skb->ip_summed=0;	/* Our free checksum is bogus for this case */

		while(opt_space>0)
		{
			if(*opt_ptr==IPOPT_NOOP)
			{
				opt_ptr++;
				opt_space--;
				continue;
			}
			if(*opt_ptr==IPOPT_END)
				break;	/* Done */
			/* Malformed length field: too short or overruns the option area */
			if(opt_space<2 || (opt_size=opt_ptr[1])<2 || opt_ptr[1]>opt_space)
			{
				/*
				 * RFC 1122: 3.2.2.5 SHOULD send parameter problem reports.
				 * NOTE(review): freed with FREE_READ here but FREE_WRITE
				 * elsewhere in this function — confirm intended.
				 */
				icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
				kfree_skb(skb, FREE_READ);
				return -EINVAL;
			}
			switch(opt_ptr[0])
			{
				case IPOPT_SEC:
					/* Should we drop this ?? */
					break;
				case IPOPT_SSRR:	/* These work almost the same way */
					target_strict=1;
					/* Fall through */
				case IPOPT_LSRR:
#ifdef CONFIG_IP_NOSR
					/* Source routing disabled at build time: junk the frame */
					kfree_skb(skb, FREE_READ);
					return -EINVAL;
#endif
					/* falls through to shared route-record handling */
				case IPOPT_RR:
					/*
					 * RFC 1122: 3.2.1.8 Support for RR is OPTIONAL.
					 * Only process if the frame is addressed to us.
					 */
					if (iph->daddr!=skb->dev->pa_addr && (brd = ip_chk_addr(iph->daddr)) == 0)
						break;
					if((opt_size<3) || ( opt_ptr[0]==IPOPT_RR && opt_ptr[2] > opt_size-4 ))
					{
						if(ip_chk_addr(iph->daddr))
							icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
						kfree_skb(skb, FREE_READ);
						return -EINVAL;
					}
					if(opt_ptr[2] > opt_size-4 )
						break;	/* route list full: nothing more to record */
					/* Bytes are [IPOPT_xxRR][Length][EntryPointer][Entry0][Entry1].... */
					/* This isn't going to be too portable - FIXME */
					if(opt_ptr[0]!=IPOPT_RR)
					{
						int t;
						/* Source route: next hop becomes the forwarding target */
						target_addr=*(u32 *)(&opt_ptr[opt_ptr[2]]);	/* Get hop */
						t=ip_chk_addr(target_addr);
						/* Refuse broadcast/multicast hops (source-route explosions) */
						if(t==IS_MULTICAST||t==IS_BROADCAST)
						{
							if(ip_chk_addr(iph->daddr))
								icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev);
							kfree_skb(skb,FREE_READ);
							return -EINVAL;
						}
					}
					*(u32 *)(&opt_ptr[opt_ptr[2]])=skb->dev->pa_addr;	/* Record hop */
					break;
				case IPOPT_TIMESTAMP:
					/*
					 * RFC 1122: 3.2.1.8 The timestamp option is OPTIONAL but if
					 * implemented MUST meet various rules.  Not implemented yet.
					 */
					NETDEBUG(printk("ICMP: Someone finish the timestamp routine ;)\n"));
					break;
				default:
					break;
			}
			opt_ptr+=opt_size;
			opt_space-=opt_size;
		}
	}

	/*
	 * Remember if the frame is fragmented: bit 0 = more fragments follow,
	 * bit 1 = nonzero offset (not the first fragment).
	 */
	if(iph->frag_off)
	{
		if (iph->frag_off & htons(IP_MF))
			is_frag|=1;
		if (iph->frag_off & htons(IP_OFFSET))
			is_frag|=2;
	}

	/*
	 * Is it for us?  For most hosts over 99% of packets match the first
	 * (cheap) comparison and skip the expensive ip_chk_addr() call.
	 */
	if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0)
	{
#ifdef CONFIG_IP_MULTICAST
		/* Multicast frames are only accepted for groups we have joined
		   (224.0.0.1 is implicitly everyone's group) */
		if(brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK))
		{
			struct ip_mc_list *ip_mc=dev->ip_mc_list;
			do
			{
				if(ip_mc==NULL)
				{
					kfree_skb(skb, FREE_WRITE);
					return 0;
				}
				if(ip_mc->multiaddr==iph->daddr)
					break;
				ip_mc=ip_mc->next;
			}
			while(1);
		}
#endif
#ifdef CONFIG_IP_MASQUERADE
		/*
		 * Do we need to de-masquerade this fragment?  If so the real
		 * destination was rewritten by ip_fw_demasquerade; re-forward
		 * with flag bit 2 set so it is not re-masqueraded.
		 */
		if (ip_fw_demasquerade(skb))
		{
			struct iphdr *iph=skb->h.iph;
			ip_forward(skb, dev, is_frag|4, iph->daddr, 0);
			kfree_skb(skb, FREE_WRITE);
			return(0);
		}
#endif
		/* Account for the packet */
#ifdef CONFIG_IP_ACCT
		ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
#endif

		/*
		 * Reassemble IP fragments.  ip_defrag returns NULL until the
		 * datagram is complete, in which case it owns the skb.
		 */
		if(is_frag)
		{
			skb=ip_defrag(iph,skb,dev);
			if(skb==NULL)
				return 0;
			skb->dev = dev;
			iph=skb->h.iph;
		}

		/*
		 * Point into the IP datagram, just past the header.
		 */
		skb->ip_hdr = iph;
		skb->h.raw += iph->ihl*4;

		/*
		 * Deliver to raw sockets.  To avoid surplus copies, the LAST
		 * matching raw socket gets the original skb after the protocol
		 * handlers have run; earlier ones get clones now.
		 */
		hash = iph->protocol & (SOCK_ARRAY_SIZE-1);

		if((raw_sk=raw_prot.sock_array[hash])!=NULL)
		{
			struct sock *sknext=NULL;
			struct sk_buff *skb1;
			raw_sk=get_sock_raw(raw_sk, hash, iph->saddr, iph->daddr);
			if(raw_sk)	/* Any raw sockets */
			{
				do
				{
					/* Find the next */
					sknext=get_sock_raw(raw_sk->next, hash, iph->saddr, iph->daddr);
					if(sknext)
						skb1=skb_clone(skb, GFP_ATOMIC);
					else
						break;	/* One pending raw socket left */
					if(skb1)
						raw_rcv(raw_sk, skb1, dev, iph->saddr,iph->daddr);
					raw_sk=sknext;
				}
				while(raw_sk!=NULL);
				/* raw_sk is now the last raw socket, delivered to below */
			}
		}

		/*
		 * skb->h.raw now points at the protocol beyond the IP header.
		 * Walk the registered transport protocols; clone if more than
		 * one consumer (or a raw delivery) still needs the data.
		 */
		hash = iph->protocol & (MAX_INET_PROTOS -1);
		for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next)
		{
			struct sk_buff *skb2;

			if (ipprot->protocol != iph->protocol)
				continue;
			if (ipprot->copy || raw_sk)
			{
				skb2 = skb_clone(skb, GFP_ATOMIC);
				if(skb2==NULL)
					continue;
			}
			else
			{
				skb2 = skb;
			}
			flag = 1;

			/*
			 * Pass on the datagram to each protocol that wants it,
			 * based on the datagram protocol.  We should really
			 * check the protocol handler's return values here...
			 */
			ipprot->handler(skb2, dev, NULL, iph->daddr,
				(ntohs(iph->tot_len) - (iph->ihl * 4)),
				iph->saddr, 0, ipprot);
		}

		/*
		 * All protocols checked.
		 * If this packet was a broadcast, we may *not* reply to it, since that
		 * causes (proven, grin) ARP storms and a leakage of memory (i.e. all
		 * ICMP reply messages get queued up for transmission...)
		 */
		if(raw_sk!=NULL)	/* Shift to last raw user */
			raw_rcv(raw_sk, skb, dev, iph->saddr, iph->daddr);
		else if (!flag)		/* Free and report errors */
		{
			if (brd != IS_BROADCAST && brd!=IS_MULTICAST)
				icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev);
			kfree_skb(skb, FREE_WRITE);
		}

		return(0);
	}

	/*
	 * Not for us.  Don't forward multicast or broadcast frames.
	 */
	if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST)
	{
		kfree_skb(skb,FREE_WRITE);
		return 0;
	}

	/*
	 * The packet is for another target.  Forward the frame.
	 */
#ifdef CONFIG_IP_FORWARD
	ip_forward(skb, dev, is_frag, target_addr, target_strict);
#else
/*	printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n",
			iph->saddr,iph->daddr);*/
	ip_statistics.IpInAddrErrors++;
#endif
	/*
	 * The forwarder is inefficient and copies the packet.  We
	 * free the original now.
	 */
	kfree_skb(skb, FREE_WRITE);
	return(0);
}
1653
1654 /*1655 * Loop a packet back to the sender.1656 */1657
1658 staticvoidip_loopback(structdevice *old_dev, structsk_buff *skb)
/* */1659 {1660 externstructdeviceloopback_dev;
1661 structdevice *dev=&loopback_dev;
1662 intlen=skb->len-old_dev->hard_header_len;
1663 structsk_buff *newskb=dev_alloc_skb(len+dev->hard_header_len+15);
1664
1665 if(newskb==NULL)
1666 return;
1667
1668 newskb->link3=NULL;
1669 newskb->sk=NULL;
1670 newskb->dev=dev;
1671 newskb->saddr=skb->saddr;
1672 newskb->daddr=skb->daddr;
1673 newskb->raddr=skb->raddr;
1674 newskb->free=1;
1675 newskb->lock=0;
1676 newskb->users=0;
1677 newskb->pkt_type=skb->pkt_type;
1678
1679 /*1680 * Put a MAC header on the packet1681 */1682 ip_send(newskb, skb->ip_hdr->daddr, len, dev, skb->ip_hdr->saddr);
1683 /*1684 * Add the rest of the data space. 1685 */1686 newskb->ip_hdr=(structiphdr *)skb_put(skb, len);
1687 /*1688 * Copy the data1689 */1690 memcpy(newskb->ip_hdr,skb->ip_hdr,len);
1691
1692 /* Recurse. The device check against IFF_LOOPBACK will stop infinite recursion */1693
1694 /*printk("Loopback output queued [%lX to %lX].\n", newskb->ip_hdr->saddr,newskb->ip_hdr->daddr);*/1695 ip_queue_xmit(NULL, dev, newskb, 1);
1696 }1697
1698
1699 /*1700 * Queues a packet to be sent, and starts the transmitter1701 * if necessary. if free = 1 then we free the block after1702 * transmit, otherwise we don't. If free==2 we not only1703 * free the block but also don't assign a new ip seq number.1704 * This routine also needs to put in the total length,1705 * and compute the checksum1706 */1707
/*
 * Queue a fully-built IP packet (MAC header space already present in
 * skb->data) for transmission on 'dev'.  free==1: free after transmit;
 * free==0: keep on sk's send queue (TCP retransmission); free==2: like 1
 * but do not assign a new IP id (refragmented frames keep theirs).
 * Fills in tot_len/id/checksum, fragments if needed, and loops back
 * multicast/broadcast copies for local listeners.
 */
void ip_queue_xmit(struct sock *sk, struct device *dev,
	      struct sk_buff *skb, int free)
{
	struct iphdr *iph;
	unsigned char *ptr;

	/* Sanity check */
	if (dev == NULL)
	{
		NETDEBUG(printk("IP: ip_queue_xmit dev = NULL\n"));
		return;
	}

	IS_SKB(skb);

	/*
	 * Do some book-keeping in the packet for later
	 */
	skb->dev = dev;
	skb->when = jiffies;

	/*
	 * Find the IP header (it sits just past the MAC header) and set the
	 * total length.  ntohs here is equivalent to htons — byte-swap only.
	 */
	ptr = skb->data;
	ptr += dev->hard_header_len;
	iph = (struct iphdr *)ptr;
	skb->ip_hdr = iph;
	iph->tot_len = ntohs(skb->len-dev->hard_header_len);

#ifdef CONFIG_IP_FIREWALL
	if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
		/* just don't send this packet
		   NOTE(review): the skb is neither freed nor queued here —
		   looks like a leak for free!=0 callers; confirm ownership. */
		return;
#endif

	/*
	 * No reassigning numbers to fragments...
	 */
	if(free!=2)
		iph->id = htons(ip_id_count++);
	else
		free=1;

	/* All buffers without an owner socket get freed */
	if (sk == NULL)
		free = 1;

	skb->free = free;

	/*
	 * Do we need to fragment?  The fragmenter copies; free the original.
	 */
	if(skb->len > dev->mtu + dev->hard_header_len)
	{
		ip_fragment(sk,skb,dev,0);
		IS_SKB(skb);
		kfree_skb(skb,FREE_WRITE);
		return;
	}

	/*
	 * Add an IP checksum
	 */
	ip_send_check(iph);

	/*
	 * You cannot queue a packet already on a list.
	 * Spot this and moan loudly.
	 */
	if (skb->next != NULL)
	{
		NETDEBUG(printk("ip_queue_xmit: next != NULL\n"));
		skb_unlink(skb);
	}

	/*
	 * If a sender wishes the packet to remain unfreed
	 * we add it to his send queue.  This arguably belongs
	 * in the TCP level since nobody else uses it.  BUT
	 * remember IPng might change all the rules.
	 */
	if (!free)
	{
		unsigned long flags;
		/* The socket now has more outstanding blocks */
		sk->packets_out++;

		/* Protect the list for a moment (append under interrupts off) */
		save_flags(flags);
		cli();

		if (skb->link3 != NULL)
		{
			NETDEBUG(printk("ip.c: link3 != NULL\n"));
			skb->link3 = NULL;
		}
		if (sk->send_head == NULL)
		{
			sk->send_tail = skb;
			sk->send_head = skb;
		}
		else
		{
			sk->send_tail->link3 = skb;
			sk->send_tail = skb;
		}
		/* skb->link3 is NULL */

		/* Interrupt restore */
		restore_flags(flags);
	}
	else
		/* Remember who owns the buffer */
		skb->sk = sk;

	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
	ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
#endif

#ifdef CONFIG_IP_MULTICAST

	/*
	 * Multicasts are looped back for other local users
	 * (unless the socket opted out via IP_MULTICAST_LOOP).
	 */
	if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
	{
		if(sk==NULL || sk->ip_mc_loop)
		{
			if(iph->daddr==IGMP_ALL_HOSTS)
				ip_loopback(dev,skb);
			else
			{
				/* Only loop back if we are a member of the group */
				struct ip_mc_list *imc=dev->ip_mc_list;
				while(imc!=NULL)
				{
					if(imc->multiaddr==iph->daddr)
					{
						ip_loopback(dev,skb);
						break;
					}
					imc=imc->next;
				}
			}
		}
		/* Multicasts with ttl 0 must not go beyond the host */

		if(skb->ip_hdr->ttl==0)
		{
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
#endif
	/* Broadcasts also get a local copy */
	if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
		ip_loopback(dev,skb);

	if (dev->flags & IFF_UP)
	{
		/*
		 * If we have an owner use its priority setting,
		 * otherwise use NORMAL
		 */
		if (sk != NULL)
		{
			dev_queue_xmit(skb, dev, sk->priority);
		}
		else
		{
			dev_queue_xmit(skb, dev, SOPRI_NORMAL);
		}
	}
	else
	{
		/* Interface down: drop (only if we own the buffer) */
		ip_statistics.IpOutDiscards++;
		if (free)
			kfree_skb(skb, FREE_WRITE);
	}
}
1913
1914
1915 #ifdefCONFIG_IP_MULTICAST1916
1917 /*1918 * Write an multicast group list table for the IGMP daemon to1919 * read.1920 */1921
1922 intip_mc_procinfo(char *buffer, char **start, off_toffset, intlength)
/* */1923 {1924 off_tpos=0, begin=0;
1925 structip_mc_list *im;
1926 unsignedlongflags;
1927 intlen=0;
1928 structdevice *dev;
1929
1930 len=sprintf(buffer,"Device : Count\tGroup Users Timer\n");
1931 save_flags(flags);
1932 cli();
1933
1934 for(dev = dev_base; dev; dev = dev->next)
1935 {1936 if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST))
1937 {1938 len+=sprintf(buffer+len,"%-10s: %5d\n",
1939 dev->name, dev->mc_count);
1940 for(im = dev->ip_mc_list; im; im = im->next)
1941 {1942 len+=sprintf(buffer+len,
1943 "\t\t\t%08lX %5d %d:%08lX\n",
1944 im->multiaddr, im->users,
1945 im->tm_running, im->timer.expires);
1946 pos=begin+len;
1947 if(pos<offset)
1948 {1949 len=0;
1950 begin=pos;
1951 }1952 if(pos>offset+length)
1953 break;
1954 }1955 }1956 }1957 restore_flags(flags);
1958 *start=buffer+(offset-begin);
1959 len-=(offset-begin);
1960 if(len>length)
1961 len=length;
1962 returnlen;
1963 }1964
1965
1966 #endif1967 /*1968 * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on1969 * an IP socket.1970 *1971 * We implement IP_TOS (type of service), IP_TTL (time to live).1972 *1973 * Next release we will sort out IP_OPTIONS since for some people are kind of important.1974 */1975
1976 staticstructdevice *ip_mc_find_devfor(unsignedlongaddr)
/* */1977 {1978 structdevice *dev;
1979 for(dev = dev_base; dev; dev = dev->next)
1980 {1981 if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)&&
1982 (dev->pa_addr==addr))
1983 returndev;
1984 }1985
1986 returnNULL;
1987 }1988
/*
 * Socket option code for IP — the end of the line after any TCP/UDP etc.
 * options on an IP socket.  Implements IP_TOS, IP_TTL, the multicast
 * options, and (when configured) the firewall/accounting control calls.
 * 'optval' is a userspace pointer; every read goes through verify_area +
 * get_user/memcpy_fromfs.  Returns 0 or a negative errno.
 */
int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val,err;
	unsigned char ucval;
#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT)
	struct ip_fw tmp_fw;
#endif
	if (optval == NULL)
		return(-EINVAL);

	err=verify_area(VERIFY_READ, optval, sizeof(int));
	if(err)
		return err;

	/* Fetch the argument both as an int and as a byte; each case below
	   uses whichever width it expects */
	val = get_user((int *) optval);
	ucval=get_user((unsigned char *) optval);

	if(level!=SOL_IP)
		return -EOPNOTSUPP;

	switch(optname)
	{
		case IP_TOS:
			if(val<0||val>255)
				return -EINVAL;
			sk->ip_tos=val;
			/* TOS also adjusts the queueing priority */
			if(val==IPTOS_LOWDELAY)
				sk->priority=SOPRI_INTERACTIVE;
			if(val==IPTOS_THROUGHPUT)
				sk->priority=SOPRI_BACKGROUND;
			return 0;
		case IP_TTL:
			if(val<1||val>255)
				return -EINVAL;
			sk->ip_ttl=val;
			return 0;
#ifdef CONFIG_IP_MULTICAST
		case IP_MULTICAST_TTL:
		{
			sk->ip_mc_ttl=(int)ucval;
			return 0;
		}
		case IP_MULTICAST_LOOP:
		{
			if(ucval!=0 && ucval!=1)
				return -EINVAL;
			sk->ip_mc_loop=(int)ucval;
			return 0;
		}
		case IP_MULTICAST_IF:
		{
			struct in_addr addr;
			struct device *dev=NULL;

			/*
			 * Check the arguments are allowable
			 */
			err=verify_area(VERIFY_READ, optval, sizeof(addr));
			if(err)
				return err;

			memcpy_fromfs(&addr,optval,sizeof(addr));

			/*
			 * What address has been requested
			 */
			if(addr.s_addr==INADDR_ANY)	/* Default */
			{
				sk->ip_mc_name[0]=0;
				return 0;
			}

			/*
			 * Find the device
			 */
			dev=ip_mc_find_devfor(addr.s_addr);

			/*
			 * Did we find one
			 */
			if(dev)
			{
				strcpy(sk->ip_mc_name,dev->name);
				return 0;
			}
			return -EADDRNOTAVAIL;
		}

		case IP_ADD_MEMBERSHIP:
		{
			/*
			 * FIXME: Add/Del membership should have a semaphore protecting them from re-entry
			 */
			struct ip_mreq mreq;
			unsigned long route_src;
			struct rtable *rt;
			struct device *dev=NULL;

			/*
			 * Check the arguments.
			 */
			err=verify_area(VERIFY_READ, optval, sizeof(mreq));
			if(err)
				return err;

			memcpy_fromfs(&mreq,optval,sizeof(mreq));

			/*
			 * Get device for use later
			 */
			if(mreq.imr_interface.s_addr==INADDR_ANY)
			{
				/*
				 * Not set so scan: route toward the group and use
				 * whatever device the routing table picks.
				 */
				if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL)
				{
					dev=rt->rt_dev;
					rt->rt_use--;	/* drop the reference ip_rt_route took */
				}
			}
			else
			{
				/*
				 * Find a suitable device.
				 */
				dev=ip_mc_find_devfor(mreq.imr_interface.s_addr);
			}

			/*
			 * No device, no cookies.
			 */
			if(!dev)
				return -ENODEV;

			/*
			 * Join group.
			 */
			return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr);
		}

		case IP_DROP_MEMBERSHIP:
		{
			struct ip_mreq mreq;
			struct rtable *rt;
			unsigned long route_src;
			struct device *dev=NULL;

			/*
			 * Check the arguments
			 */
			err=verify_area(VERIFY_READ, optval, sizeof(mreq));
			if(err)
				return err;

			memcpy_fromfs(&mreq,optval,sizeof(mreq));

			/*
			 * Get device for use later (same resolution as ADD above)
			 */
			if(mreq.imr_interface.s_addr==INADDR_ANY)
			{
				if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL)
				{
					dev=rt->rt_dev;
					rt->rt_use--;
				}
			}
			else
			{
				dev=ip_mc_find_devfor(mreq.imr_interface.s_addr);
			}

			/*
			 * Did we find a suitable device.
			 */
			if(!dev)
				return -ENODEV;

			/*
			 * Leave group
			 */
			return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr);
		}
#endif
#ifdef CONFIG_IP_FIREWALL
		case IP_FW_ADD_BLK:
		case IP_FW_DEL_BLK:
		case IP_FW_ADD_FWD:
		case IP_FW_DEL_FWD:
		case IP_FW_CHK_BLK:
		case IP_FW_CHK_FWD:
		case IP_FW_FLUSH_BLK:
		case IP_FW_FLUSH_FWD:
		case IP_FW_ZERO_BLK:
		case IP_FW_ZERO_FWD:
		case IP_FW_POLICY_BLK:
		case IP_FW_POLICY_FWD:
			/* Firewall control: root only, bounded copy of the rule */
			if(!suser())
				return -EPERM;
			if(optlen>sizeof(tmp_fw) || optlen<1)
				return -EINVAL;
			err=verify_area(VERIFY_READ,optval,optlen);
			if(err)
				return err;
			memcpy_fromfs(&tmp_fw,optval,optlen);
			err=ip_fw_ctl(optname, &tmp_fw,optlen);
			return -err;	/* -0 is 0 after all */

#endif
#ifdef CONFIG_IP_ACCT
		case IP_ACCT_DEL:
		case IP_ACCT_ADD:
		case IP_ACCT_FLUSH:
		case IP_ACCT_ZERO:
			/* Accounting control: same privilege and copy rules */
			if(!suser())
				return -EPERM;
			if(optlen>sizeof(tmp_fw) || optlen<1)
				return -EINVAL;
			err=verify_area(VERIFY_READ,optval,optlen);
			if(err)
				return err;
			memcpy_fromfs(&tmp_fw, optval,optlen);
			err=ip_acct_ctl(optname, &tmp_fw,optlen);
			return -err;	/* -0 is 0 after all */
#endif
		/* IP_OPTIONS and friends go here eventually */
		default:
			return(-ENOPROTOOPT);
	}
}
2237 /*2238 * Get the options. Note for future reference. The GET of IP options gets the2239 * _received_ ones. The set sets the _sent_ ones.2240 */2241
/*
 * Get the options.  Note for future reference.  The GET of IP options gets
 * the _received_ ones.  The set sets the _sent_ ones.
 * Most options return an int via the shared tail below; IP_MULTICAST_IF
 * returns the interface name string and its length directly.
 */
int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;
#ifdef CONFIG_IP_MULTICAST
	int len;
#endif

	if(level!=SOL_IP)
		return -EOPNOTSUPP;

	switch(optname)
	{
		case IP_TOS:
			val=sk->ip_tos;
			break;
		case IP_TTL:
			val=sk->ip_ttl;
			break;
#ifdef CONFIG_IP_MULTICAST
		case IP_MULTICAST_TTL:
			val=sk->ip_mc_ttl;
			break;
		case IP_MULTICAST_LOOP:
			val=sk->ip_mc_loop;
			break;
		case IP_MULTICAST_IF:
			/* Copies the bound device name out; NOTE(review): the
			   copy is strlen bytes with no NUL terminator — the
			   caller must use the returned length.  Confirm this is
			   the intended ABI. */
			err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
			if(err)
				return err;
			len=strlen(sk->ip_mc_name);
			err=verify_area(VERIFY_WRITE, optval, len);
			if(err)
				return err;
			put_user(len,(int *) optlen);
			memcpy_tofs((void *)optval,sk->ip_mc_name, len);
			return 0;
#endif
		default:
			return(-ENOPROTOOPT);
	}
	/* Common tail: write back a single int result */
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_user(sizeof(int),(int *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_user(val,(int *) optval);

	return(0);
}
2295 /*2296 * Build and send a packet, with as little as one copy2297 *2298 * Doesn't care much about ip options... option length can be2299 * different for fragment at 0 and other fragments.2300 *2301 * Note that the fragment at the highest offset is sent first,2302 * so the getfrag routine can fill in the TCP/UDP checksum header2303 * field in the last fragment it sends... actually it also helps2304 * the reassemblers, they can put most packets in at the head of2305 * the fragment queue, and they know the total size in advance. This2306 * last feature will measurable improve the Linux fragment handler.2307 *2308 * The callback has five args, an arbitrary pointer (copy of frag),2309 * the source IP address (may depend on the routing table), the 2310 * destination adddress (char *), the offset to copy from, and the2311 * length to be copied.2312 * 2313 */2314
2315 intip_build_xmit(structsock *sk,
/* */2316 voidgetfrag (void *,
2317 int,
2318 char *,
2319 unsignedint,
2320 unsignedint),
2321 void *frag,
2322 unsignedshortintlength,
2323 intdaddr,
2324 intflags,
2325 inttype)
2326 {2327 structrtable *rt;
2328 unsignedintfraglen, maxfraglen, fragheaderlen;
2329 intoffset, mf;
2330 unsignedlongsaddr;
2331 unsignedshortid;
2332 structiphdr *iph;
2333 intlocal=0;
2334 structdevice *dev;
2335
2336
2337 #ifdefCONFIG_INET_MULTICAST2338 if(sk && MULTICAST(daddr) && *sk->ip_mc_name)
2339 {2340 dev=dev_get(skb->ip_mc_name);
2341 if(!dev)
2342 return -ENODEV;
2343 rt=NULL;
2344 }2345 else2346 {2347 #endif2348 /*2349 * Perform the IP routing decisions2350 */2351
2352 if(sk->localroute || flags&MSG_DONTROUTE)
2353 local=1;
2354
2355 rt = sk->ip_route_cache;
2356
2357 /*2358 * See if the routing cache is outdated. We need to clean this up once we are happy it is reliable2359 * by doing the invalidation actively in the route change and header change.2360 */2361
2362 saddr=sk->ip_route_saddr;
2363 if(!rt || sk->ip_route_stamp != rt_stamp || daddr!=sk->ip_route_daddr || sk->ip_route_local!=local || sk->saddr!=sk->ip_route_saddr)
2364 {2365 if(local)
2366 rt = ip_rt_local(daddr, NULL, &saddr);
2367 else2368 rt = ip_rt_route(daddr, NULL, &saddr);
2369 sk->ip_route_local=local;
2370 sk->ip_route_daddr=daddr;
2371 sk->ip_route_saddr=saddr;
2372 sk->ip_route_stamp=rt_stamp;
2373 sk->ip_route_cache=rt;
2374 sk->ip_hcache_ver=NULL;
2375 sk->ip_hcache_state= 0;
2376 }2377 elseif(rt)
2378 {2379 /*2380 * Attempt header caches only if the cached route is being reused. Header cache2381 * is not ultra cheap to set up. This means we only set it up on the second packet,2382 * so one shot communications are not slowed. We assume (seems reasonable) that 2 is2383 * probably going to be a stream of data.2384 */2385 if(rt->rt_dev->header_cache && sk->ip_hcache_state!= -1)
2386 {2387 if(sk->ip_hcache_ver==NULL || sk->ip_hcache_stamp!=*sk->ip_hcache_ver)
2388 rt->rt_dev->header_cache(rt->rt_dev,sk,saddr,daddr);
2389 else2390 /* Can't cache. Remember this */2391 sk->ip_hcache_state= -1;
2392 }2393 }2394
2395 if (rt == NULL)
2396 {2397 ip_statistics.IpOutNoRoutes++;
2398 return(-ENETUNREACH);
2399 }2400
2401 if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr)))
2402 saddr = sk->saddr;
2403
2404 dev=rt->rt_dev;
2405 #ifdefCONFIG_INET_MULTICAST2406 }2407 #endif2408
2409 /*2410 * Now compute the buffer space we require2411 */2412
2413 fragheaderlen = dev->hard_header_len;
2414 if(type != IPPROTO_RAW)
2415 fragheaderlen += 20;
2416
2417 /*2418 * Fragheaderlen is the size of 'overhead' on each buffer. Now work2419 * out the size of the frames to send.2420 */2421
2422 maxfraglen = ((dev->mtu-20) & ~7) + fragheaderlen;
2423
2424 /*2425 * Start at the end of the frame by handling the remainder.2426 */2427
2428 offset = length - (length % (maxfraglen - fragheaderlen));
2429
2430 /*2431 * Amount of memory to allocate for final fragment.2432 */2433
2434 fraglen = length - offset + fragheaderlen;
2435
2436 if(fraglen==0)
2437 {2438 fraglen = maxfraglen;
2439 offset -= maxfraglen-fragheaderlen;
2440 }2441
2442
2443 /*2444 * The last fragment will not have MF (more fragments) set.2445 */2446
2447 mf = 0;
2448
2449 /*2450 * Can't fragment raw packets 2451 */2452
2453 if (type == IPPROTO_RAW && offset > 0)
2454 return(-EMSGSIZE);
2455
2456 /*2457 * Get an identifier2458 */2459
2460 id = htons(ip_id_count++);
2461
2462 /*2463 * Being outputting the bytes.2464 */2465
2466 do2467 {2468 structsk_buff * skb;
2469 interror;
2470 char *data;
2471
2472 /*2473 * Get the memory we require with some space left for alignment.2474 */2475
2476 skb = sock_alloc_send_skb(sk, fraglen+15, 0, &error);
2477 if (skb == NULL)
2478 return(error);
2479
2480 /*2481 * Fill in the control structures2482 */2483
2484 skb->next = skb->prev = NULL;
2485 skb->dev = dev;
2486 skb->when = jiffies;
2487 skb->free = 1; /* dubious, this one */2488 skb->sk = sk;
2489 skb->arp = 0;
2490 skb->saddr = saddr;
2491 skb->raddr = (rt&&rt->rt_gateway) ? rt->rt_gateway : daddr;
2492 skb_reserve(skb,(dev->hard_header_len+15)&~15);
2493 data = skb_put(skb, fraglen-dev->hard_header_len);
2494
2495 /*2496 * Save us ARP and stuff. In the optimal case we do no route lookup (route cache ok)2497 * no ARP lookup (arp cache ok) and output. The cache checks are still too slow but2498 * this can be fixed later. For gateway routes we ought to have a rt->.. header cache2499 * pointer to speed header cache builds for identical targets.2500 */2501
2502 if(sk->ip_hcache_state>0)
2503 {2504 memcpy(skb->data,sk->ip_hcache_data, dev->hard_header_len);
2505 skb->arp=1;
2506 }2507 elseif (dev->hard_header)
2508 {2509 if(dev->hard_header(skb, dev, ETH_P_IP,
2510 NULL, NULL, 0)>0)
2511 skb->arp=1;
2512 }2513
2514 /*2515 * Find where to start putting bytes.2516 */2517
2518 iph = (structiphdr *)data;
2519
2520 /*2521 * Only write IP header onto non-raw packets 2522 */2523
2524 if(type != IPPROTO_RAW)
2525 {2526
2527 iph->version = 4;
2528 iph->ihl = 5; /* ugh */2529 iph->tos = sk->ip_tos;
2530 iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
2531 iph->id = id;
2532 iph->frag_off = htons(offset>>3);
2533 iph->frag_off |= mf;
2534 #ifdefCONFIG_IP_MULTICAST2535 if (MULTICAST(daddr))
2536 iph->ttl = sk->ip_mc_ttl;
2537 else2538 #endif2539 iph->ttl = sk->ip_ttl;
2540 iph->protocol = type;
2541 iph->check = 0;
2542 iph->saddr = saddr;
2543 iph->daddr = daddr;
2544 iph->check = ip_fast_csum((unsignedchar *)iph, iph->ihl);
2545 data += iph->ihl*4;
2546
2547 /*2548 * Any further fragments will have MF set.2549 */2550
2551 mf = htons(IP_MF);
2552 }2553
2554 /*2555 * User data callback2556 */2557
2558 getfrag(frag, saddr, data, offset, fraglen-fragheaderlen);
2559
2560 /*2561 * Account for the fragment.2562 */2563
2564 #ifdefCONFIG_IP_ACCT2565 if(!offset)
2566 ip_fw_chk(iph, dev, ip_acct_chain, IP_FW_F_ACCEPT, 1);
2567 #endif2568 offset -= (maxfraglen-fragheaderlen);
2569 fraglen = maxfraglen;
2570
2571 #ifdefCONFIG_IP_MULTICAST2572
2573 /*2574 * Multicasts are looped back for other local users2575 */2576
2577 if (MULTICAST(daddr) && !(dev->flags&IFF_LOOPBACK))
2578 {2579 /*2580 * Loop back any frames. The check for IGMP_ALL_HOSTS is because2581 * you are always magically a member of this group.2582 */2583
2584 if(sk==NULL || sk->ip_mc_loop)
2585 {2586 if(skb->daddr==IGMP_ALL_HOSTS)
2587 ip_loopback(rt->rt_dev,skb);
2588 else2589 {2590 structip_mc_list *imc=rt->rt_dev->ip_mc_list;
2591 while(imc!=NULL)
2592 {2593 if(imc->multiaddr==daddr)
2594 {2595 ip_loopback(rt->rt_dev,skb);
2596 break;
2597 }2598 imc=imc->next;
2599 }2600 }2601 }2602
2603 /*2604 * Multicasts with ttl 0 must not go beyond the host. Fixme: avoid the2605 * extra clone.2606 */2607
2608 if(skb->ip_hdr->ttl==0)
2609 kfree_skb(skb, FREE_READ);
2610 }2611 #endif2612 /*2613 * Now queue the bytes into the device.2614 */2615
2616 if (dev->flags & IFF_UP)
2617 {2618 dev_queue_xmit(skb, dev, sk->priority);
2619 }2620 else2621 {2622 /*2623 * Whoops... 2624 *2625 * FIXME: There is a small nasty here. During the ip_build_xmit we could2626 * page fault between the route lookup and device send, the device might be2627 * removed and unloaded.... We need to add device locks on this.2628 */2629
2630 ip_statistics.IpOutDiscards++;
2631 kfree_skb(skb, FREE_WRITE);
2632 return(0); /* lose rest of fragments */2633 }2634 }2635 while (offset >= 0);
2636
2637 return(0);
2638 }2639
2640
2641 /*2642 * IP protocol layer initialiser2643 */2644
/*
 * Link-layer hook for IP: registered with dev_add_pack() so that every
 * received frame of this type is handed to ip_rcv().  The type field is
 * left as 0 here and filled in with htons(ETH_P_IP) at run time by
 * ip_init(), since htons() is not a constant expression on all targets.
 */
static struct packet_type ip_packet_type =
{
	0,		/* type: set to htons(ETH_P_IP) in ip_init() */
	NULL,		/* dev: NULL => match frames from all devices */
	ip_rcv,		/* func: IP receive handler */
	NULL,		/* data: no private data for this hook */
	NULL,		/* next: filled in by the packet-type list code */
};
2653
2654 /*2655 * Device notifier2656 */2657
2658 staticintip_rt_event(unsignedlongevent, void *ptr)
/* */2659 {2660 if(event==NETDEV_DOWN)
2661 ip_rt_flush(ptr);
2662 returnNOTIFY_DONE;
2663 }2664
/*
 * Notifier block hooked into the network-device notifier chain by
 * ip_init(), so ip_rt_event() is called on device state changes.
 */
struct notifier_block ip_rt_notifier={
	ip_rt_event,	/* notifier_call: handles NETDEV_DOWN */
	NULL,		/* next: chained in by register_netdevice_notifier() */
	0		/* priority */
};
2670
2671 /*2672 * IP registers the packet type and then calls the subprotocol initialisers2673 */2674
2675 voidip_init(void)
/* */2676 {2677 ip_packet_type.type=htons(ETH_P_IP);
2678 dev_add_pack(&ip_packet_type);
2679
2680 /* So we flush routes when a device is downed */2681 register_netdevice_notifier(&ip_rt_notifier);
2682 /* ip_raw_init();2683 ip_packet_init();2684 ip_tcp_init();2685 ip_udp_init();*/2686 }2687