1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * The Internet Protocol (IP) module. 7 * 8 * Version: @(#)ip.c 1.0.16b 9/1/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Donald Becker, <becker@super.org> 13 * Alan Cox, <Alan.Cox@linux.org> 14 * Richard Underwood 15 * Stefan Becker, <stefanb@yello.ping.de> 16 * Jorge Cwik, <jorge@laser.satlink.net> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 18 * 19 * 20 * Fixes: 21 * Alan Cox : Commented a couple of minor bits of surplus code 22 * Alan Cox : Undefining IP_FORWARD doesn't include the code 23 * (just stops a compiler warning). 24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes 25 * are junked rather than corrupting things. 26 * Alan Cox : Frames to bad broadcast subnets are dumped 27 * We used to process them non broadcast and 28 * boy could that cause havoc. 29 * Alan Cox : ip_forward sets the free flag on the 30 * new frame it queues. Still crap because 31 * it copies the frame but at least it 32 * doesn't eat memory too. 33 * Alan Cox : Generic queue code and memory fixes. 34 * Fred Van Kempen : IP fragment support (borrowed from NET2E) 35 * Gerhard Koerting: Forward fragmented frames correctly. 36 * Gerhard Koerting: Fixes to my fix of the above 8-). 37 * Gerhard Koerting: IP interface addressing fix. 38 * Linus Torvalds : More robustness checks 39 * Alan Cox : Even more checks: Still not as robust as it ought to be 40 * Alan Cox : Save IP header pointer for later 41 * Alan Cox : ip option setting 42 * Alan Cox : Use ip_tos/ip_ttl settings 43 * Alan Cox : Fragmentation bogosity removed 44 * (Thanks to Mark.Bush@prg.ox.ac.uk) 45 * Dmitry Gorodchanin : Send of a raw packet crash fix. 46 * Alan Cox : Silly ip bug when an overlength 47 * fragment turns up. Now frees the 48 * queue. 49 * Linus Torvalds/ : Memory leakage on fragmentation 50 * Alan Cox : handling. 51 * Gerhard Koerting: Forwarding uses IP priority hints 52 * Teemu Rantanen : Fragment problems. 53 * Alan Cox : General cleanup, comments and reformat 54 * Alan Cox : SNMP statistics 55 * Alan Cox : BSD address rule semantics. Also see 56 * UDP as there is a nasty checksum issue 57 * if you do things the wrong way. 58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file 59 * Alan Cox : IP options adjust sk->priority. 60 * Pedro Roque : Fix mtu/length error in ip_forward. 61 * Alan Cox : Avoid ip_chk_addr when possible. 62 * Richard Underwood : IP multicasting. 63 * Alan Cox : Cleaned up multicast handlers. 64 * Alan Cox : RAW sockets demultiplex in the BSD style. 65 * Gunther Mayer : Fix the SNMP reporting typo 66 * Alan Cox : Always in group 224.0.0.1 67 * Pauline Middelink : Fast ip_checksum update when forwarding 68 * Masquerading support. 69 * Alan Cox : Multicast loopback error for 224.0.0.1 70 * Alan Cox : IP_MULTICAST_LOOP option. 71 * Alan Cox : Use notifiers. 72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too) 73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) 74 * Stefan Becker : Send out ICMP HOST REDIRECT 75 * Arnt Gulbrandsen : ip_build_xmit 76 * Alan Cox : Per socket routing cache 77 * Alan Cox : Fixed routing cache, added header cache. 78 * Alan Cox : Loopback didnt work right in original ip_build_xmit - fixed it. 79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. 80 * Alan Cox : Incoming IP option handling. 81 * Alan Cox : Set saddr on raw output frames as per BSD. 82 * Alan Cox : Stopped broadcast source route explosions. 83 * Alan Cox : Can disable source routing 84 * Takeshi Sone : Masquerading didn't work. 85 * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. 86 * Alan Cox : Memory leaks, tramples, misc debugging. 87 * Alan Cox : Fixed multicast (by popular demand 8)) 88 * Alan Cox : Fixed forwarding (by even more popular demand 8)) 89 * Alan Cox : Fixed SNMP statistics [I think] 90 * Gerhard Koerting : IP fragmentation forwarding fix 91 * Alan Cox : Device lock against page fault. 92 * Alan Cox : IP_HDRINCL facility. 93 * Werner Almesberger : Zero fragment bug 94 * Alan Cox : RAW IP frame length bug 95 * Alan Cox : Outgoing firewall on build_xmit 96 * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel 97 * Alan Cox : Multicast routing hooks 98 * Jos Vos : Do accounting *before* call_in_firewall 99 * 100 * 101 * 102 * To Fix: 103 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient 104 * and could be made very efficient with the addition of some virtual memory hacks to permit 105 * the allocation of a buffer that can then be 'grown' by twiddling page tables. 106 * Output fragmentation wants updating along with the buffer management to use a single 107 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet 108 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause 109 * fragmentation anyway. 110 * 111 * FIXME: copy frag 0 iph to qp->iph 112 * 113 * This program is free software; you can redistribute it and/or 114 * modify it under the terms of the GNU General Public License 115 * as published by the Free Software Foundation; either version 116 * 2 of the License, or (at your option) any later version. 117 */ 118 119 #include <asm/segment.h> 120 #include <asm/system.h> 121 #include <linux/types.h> 122 #include <linux/kernel.h> 123 #include <linux/sched.h> 124 #include <linux/mm.h> 125 #include <linux/string.h> 126 #include <linux/errno.h> 127 #include <linux/config.h> 128 129 #include <linux/socket.h> 130 #include <linux/sockios.h> 131 #include <linux/in.h> 132 #include <linux/inet.h> 133 #include <linux/netdevice.h> 134 #include <linux/etherdevice.h> 135 #include <linux/proc_fs.h> 136 #include <linux/stat.h> 137 138 #include <net/snmp.h> 139 #include <net/ip.h> 140 #include <net/protocol.h> 141 #include <net/route.h> 142 #include <net/tcp.h> 143 #include <net/udp.h> 144 #include <linux/skbuff.h> 145 #include <net/sock.h> 146 #include <net/arp.h> 147 #include <net/icmp.h> 148 #include <net/raw.h> 149 #include <net/checksum.h> 150 #include <linux/igmp.h> 151 #include <linux/ip_fw.h> 152 #include <linux/firewall.h> 153 #include <linux/mroute.h> 154 #include <net/netlink.h> 155 #ifdef CONFIG_NET_ALIAS 156 #include <linux/net_alias.h> 157 #endif 158 159 extern int last_retran; 160 extern void sort_send(struct sock *sk); 161 162 #define min(a,b) ((a)<(b)?(a):(b)) 163 164 /* 165 * SNMP management statistics 166 */ 167 168 #ifdef CONFIG_IP_FORWARD 169 struct ip_mib ip_statistics={1,64,}; /* Forwarding=Yes, Default TTL=64 */ 170 #else 171 struct ip_mib ip_statistics={2,64,}; /* Forwarding=No, Default TTL=64 */ 172 #endif 173 174 /* 175 * Handle the issuing of an ioctl() request 176 * for the ip device. This is scheduled to 177 * disappear 178 */ 179 180 int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) /* */ 181 { 182 switch(cmd) 183 { 184 default: 185 return(-EINVAL); 186 } 187 } 188 189 190 191 /* 192 * This function receives all incoming IP datagrams. 193 * 194 * On entry skb->data points to the start of the IP header and 195 * the MAC header has been removed. 196 */ 197 198 int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) /* */ 199 { 200 struct iphdr *iph = skb->h.iph; 201 struct sock *raw_sk=NULL; 202 unsigned char hash; 203 unsigned char flag = 0; 204 struct inet_protocol *ipprot; 205 int brd=IS_MYADDR; 206 struct options * opt = NULL; 207 int is_frag=0; 208 __u32 daddr; 209 210 #ifdef CONFIG_FIREWALL 211 int err; 212 #endif 213 #ifdef CONFIG_IP_MROUTE 214 int mroute_pkt=0; 215 #endif 216 217 #ifdef CONFIG_NET_IPV6 218 /* 219 * Intercept IPv6 frames. We dump ST-II and invalid types just below.. 220 */ 221 222 if(iph->version == 6) 223 return ipv6_rcv(skb,dev,pt); 224 #endif 225 226 ip_statistics.IpInReceives++; 227 228 /* 229 * Tag the ip header of this packet so we can find it 230 */ 231 232 skb->ip_hdr = iph; 233 234 /* 235 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. 236 * RFC1122: 3.1.2.3 MUST discard a frame with invalid source address [NEEDS FIXING]. 237 * 238 * Is the datagram acceptable? 239 * 240 * 1. Length at least the size of an ip header 241 * 2. Version of 4 242 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] 243 * 4. Doesn't have a bogus length 244 * (5. We ought to check for IP multicast addresses and undefined types.. does this matter ?) 245 */ 246 247 if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0 248 || skb->len < ntohs(iph->tot_len)) 249 { 250 ip_statistics.IpInHdrErrors++; 251 kfree_skb(skb, FREE_WRITE); 252 return(0); 253 } 254 255 /* 256 * Our transport medium may have padded the buffer out. Now we know it 257 * is IP we can trim to the true length of the frame. 258 * Note this now means skb->len holds ntohs(iph->tot_len). 259 */ 260 261 skb_trim(skb,ntohs(iph->tot_len)); 262 263 if (iph->ihl > 5) 264 { 265 skb->ip_summed = 0; 266 if (ip_options_compile(NULL, skb)) 267 return(0); 268 opt = (struct options*)skb->proto_priv; 269 #ifdef CONFIG_IP_NOSR 270 if (opt->srr) 271 { 272 kfree_skb(skb, FREE_READ); 273 return -EINVAL; 274 } 275 #endif 276 } 277 278 /* 279 * Try to select closest <src,dst> alias device, if any. 280 * net_alias_dev_rcv_sel32 returns main device if it 281 * fails to found other. 282 */ 283 284 #ifdef CONFIG_NET_ALIAS 285 if (iph->daddr != skb->dev->pa_addr && net_alias_has(skb->dev)) 286 skb->dev = dev = net_alias_dev_rcv_sel32(skb->dev, AF_INET, iph->saddr, iph->daddr); 287 #endif 288 289 /* 290 * Account for the packet (even if the packet is 291 * not accepted by the firewall!). 292 */ 293 294 #ifdef CONFIG_IP_ACCT 295 ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1); 296 #endif 297 298 /* 299 * See if the firewall wants to dispose of the packet. 300 */ 301 302 #ifdef CONFIG_FIREWALL 303 304 if ((err=call_in_firewall(PF_INET, skb, iph))<FW_ACCEPT) 305 { 306 if(err==FW_REJECT) 307 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev); 308 kfree_skb(skb, FREE_WRITE); 309 return 0; 310 } 311 312 #endif 313 314 /* 315 * Remember if the frame is fragmented. 316 */ 317 318 if(iph->frag_off) 319 { 320 if (iph->frag_off & htons(IP_MF)) 321 is_frag|=IPFWD_FRAGMENT; 322 /* 323 * Last fragment ? 324 */ 325 326 if (iph->frag_off & htons(IP_OFFSET)) 327 is_frag|=IPFWD_LASTFRAG; 328 } 329 330 /* 331 * Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. 332 * 333 * This is inefficient. While finding out if it is for us we could also compute 334 * the routing table entry. This is where the great unified cache theory comes 335 * in as and when someone implements it 336 * 337 * For most hosts over 99% of packets match the first conditional 338 * and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at 339 * function entry. 340 */ 341 daddr = iph->daddr; 342 if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0) 343 { 344 if (opt && opt->srr) 345 { 346 int srrspace, srrptr; 347 __u32 nexthop; 348 unsigned char * optptr = ((unsigned char *)iph) + opt->srr; 349 350 if (brd != IS_MYADDR || skb->pkt_type != PACKET_HOST) 351 { 352 kfree_skb(skb, FREE_WRITE); 353 return 0; 354 } 355 356 for ( srrptr=optptr[2], srrspace = optptr[1]; 357 srrptr <= srrspace; 358 srrptr += 4 359 ) 360 { 361 int brd2; 362 if (srrptr + 3 > srrspace) 363 { 364 icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2, 365 skb->dev); 366 kfree_skb(skb, FREE_WRITE); 367 return 0; 368 } 369 memcpy(&nexthop, &optptr[srrptr-1], 4); 370 if ((brd2 = ip_chk_addr(nexthop)) == 0) 371 break; 372 if (brd2 != IS_MYADDR) 373 { 374 375 /* 376 * ANK: should we implement weak tunneling of multicasts? 377 * Are they obsolete? DVMRP specs (RFC-1075) is old enough... 378 * [They are obsolete] 379 */ 380 kfree_skb(skb, FREE_WRITE); 381 return -EINVAL; 382 } 383 memcpy(&daddr, &optptr[srrptr-1], 4); 384 } 385 if (srrptr <= srrspace) 386 { 387 opt->srr_is_hit = 1; 388 opt->is_changed = 1; 389 #ifdef CONFIG_IP_FORWARD 390 if (ip_forward(skb, dev, is_frag, nexthop)) 391 kfree_skb(skb, FREE_WRITE); 392 #else 393 ip_statistics.IpInAddrErrors++; 394 kfree_skb(skb, FREE_WRITE); 395 #endif 396 return 0; 397 } 398 } 399 400 #ifdef CONFIG_IP_MULTICAST 401 if(!(dev->flags&IFF_ALLMULTI) && brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK)) 402 { 403 /* 404 * Check it is for one of our groups 405 */ 406 struct ip_mc_list *ip_mc=dev->ip_mc_list; 407 do 408 { 409 if(ip_mc==NULL) 410 { 411 kfree_skb(skb, FREE_WRITE); 412 return 0; 413 } 414 if(ip_mc->multiaddr==iph->daddr) 415 break; 416 ip_mc=ip_mc->next; 417 } 418 while(1); 419 } 420 #endif 421 422 #ifdef CONFIG_IP_MASQUERADE 423 /* 424 * Do we need to de-masquerade this fragment? 425 */ 426 if (ip_fw_demasquerade(skb)) 427 { 428 struct iphdr *iph=skb->h.iph; 429 if (ip_forward(skb, dev, is_frag|IPFWD_MASQUERADED, iph->daddr)) 430 kfree_skb(skb, FREE_WRITE); 431 return(0); 432 } 433 #endif 434 435 /* 436 * Reassemble IP fragments. 437 */ 438 439 if(is_frag) 440 { 441 /* Defragment. Obtain the complete packet if there is one */ 442 skb=ip_defrag(iph,skb,dev); 443 if(skb==NULL) 444 return 0; 445 skb->dev = dev; 446 iph=skb->h.iph; 447 } 448 449 /* 450 * Point into the IP datagram, just past the header. 451 */ 452 453 skb->ip_hdr = iph; 454 skb->h.raw += iph->ihl*4; 455 456 #ifdef CONFIG_IP_MROUTE 457 /* 458 * Check the state on multicast routing (multicast and not 224.0.0.z) 459 */ 460 461 if(brd==IS_MULTICAST && (iph->daddr&htonl(0xFFFFFF00))!=htonl(0xE0000000)) 462 mroute_pkt=1; 463 464 #endif 465 /* 466 * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. 467 * 468 * RFC 1122: SHOULD pass TOS value up to the transport layer. 469 */ 470 471 hash = iph->protocol & (SOCK_ARRAY_SIZE-1); 472 473 /* 474 * If there maybe a raw socket we must check - if not we don't care less 475 */ 476 477 if((raw_sk=raw_prot.sock_array[hash])!=NULL) 478 { 479 struct sock *sknext=NULL; 480 struct sk_buff *skb1; 481 raw_sk=get_sock_raw(raw_sk, iph->protocol, iph->saddr, iph->daddr); 482 if(raw_sk) /* Any raw sockets */ 483 { 484 do 485 { 486 /* Find the next */ 487 sknext=get_sock_raw(raw_sk->next, iph->protocol, iph->saddr, iph->daddr); 488 if(sknext) 489 skb1=skb_clone(skb, GFP_ATOMIC); 490 else 491 break; /* One pending raw socket left */ 492 if(skb1) 493 raw_rcv(raw_sk, skb1, dev, iph->saddr,daddr); 494 raw_sk=sknext; 495 } 496 while(raw_sk!=NULL); 497 498 /* 499 * Here either raw_sk is the last raw socket, or NULL if none 500 */ 501 502 /* 503 * We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy 504 */ 505 } 506 } 507 508 /* 509 * skb->h.raw now points at the protocol beyond the IP header. 510 */ 511 512 hash = iph->protocol & (MAX_INET_PROTOS -1); 513 for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) 514 { 515 struct sk_buff *skb2; 516 517 if (ipprot->protocol != iph->protocol) 518 continue; 519 /* 520 * See if we need to make a copy of it. This will 521 * only be set if more than one protocol wants it. 522 * and then not for the last one. If there is a pending 523 * raw delivery wait for that 524 */ 525 526 #ifdef CONFIG_IP_MROUTE 527 if (ipprot->copy || raw_sk || mroute_pkt) 528 #else 529 if (ipprot->copy || raw_sk) 530 #endif 531 { 532 skb2 = skb_clone(skb, GFP_ATOMIC); 533 if(skb2==NULL) 534 continue; 535 } 536 else 537 { 538 skb2 = skb; 539 } 540 flag = 1; 541 542 /* 543 * Pass on the datagram to each protocol that wants it, 544 * based on the datagram protocol. We should really 545 * check the protocol handler's return values here... 546 */ 547 548 ipprot->handler(skb2, dev, opt, daddr, 549 (ntohs(iph->tot_len) - (iph->ihl * 4)), 550 iph->saddr, 0, ipprot); 551 } 552 553 /* 554 * All protocols checked. 555 * If this packet was a broadcast, we may *not* reply to it, since that 556 * causes (proven, grin) ARP storms and a leakage of memory (i.e. all 557 * ICMP reply messages get queued up for transmission...) 558 */ 559 560 #ifdef CONFIG_IP_MROUTE 561 /* 562 * Forward the last copy to the multicast router. If 563 * there is a pending raw deliery however make a copy 564 * and forward that. 565 */ 566 567 if(mroute_pkt) 568 { 569 flag=1; 570 if(raw_sk==NULL) 571 ipmr_forward(skb, is_frag); 572 else 573 { 574 struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); 575 if(skb2) 576 { 577 skb2->free=1; 578 ipmr_forward(skb2, is_frag); 579 } 580 } 581 } 582 #endif 583 584 if(raw_sk!=NULL) /* Shift to last raw user */ 585 raw_rcv(raw_sk, skb, dev, iph->saddr, daddr); 586 else if (!flag) /* Free and report errors */ 587 { 588 if (brd != IS_BROADCAST && brd!=IS_MULTICAST) 589 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev); 590 kfree_skb(skb, FREE_WRITE); 591 } 592 593 return(0); 594 } 595 596 /* 597 * Do any unicast IP forwarding required. 598 */ 599 600 /* 601 * Don't forward multicast or broadcast frames. 602 */ 603 604 if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST) 605 { 606 kfree_skb(skb,FREE_WRITE); 607 return 0; 608 } 609 610 /* 611 * The packet is for another target. Forward the frame 612 */ 613 614 #ifdef CONFIG_IP_FORWARD 615 if (opt && opt->is_strictroute) 616 { 617 icmp_send(skb, ICMP_PARAMETERPROB, 0, 16, skb->dev); 618 kfree_skb(skb, FREE_WRITE); 619 return -1; 620 } 621 if (ip_forward(skb, dev, is_frag, iph->daddr)) 622 kfree_skb(skb, FREE_WRITE); 623 #else 624 /* printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n", 625 iph->saddr,iph->daddr);*/ 626 ip_statistics.IpInAddrErrors++; 627 kfree_skb(skb, FREE_WRITE); 628 #endif 629 return(0); 630 } 631 632