1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * The Internet Protocol (IP) module. 7 * 8 * Version: @(#)ip.c 1.0.16b 9/1/93 9 * 10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Donald Becker, <becker@super.org> 13 * Alan Cox, <Alan.Cox@linux.org> 14 * Richard Underwood 15 * Stefan Becker, <stefanb@yello.ping.de> 16 * Jorge Cwik, <jorge@laser.satlink.net> 17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 18 * 19 * 20 * Fixes: 21 * Alan Cox : Commented a couple of minor bits of surplus code 22 * Alan Cox : Undefining IP_FORWARD doesn't include the code 23 * (just stops a compiler warning). 24 * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes 25 * are junked rather than corrupting things. 26 * Alan Cox : Frames to bad broadcast subnets are dumped 27 * We used to process them non broadcast and 28 * boy could that cause havoc. 29 * Alan Cox : ip_forward sets the free flag on the 30 * new frame it queues. Still crap because 31 * it copies the frame but at least it 32 * doesn't eat memory too. 33 * Alan Cox : Generic queue code and memory fixes. 34 * Fred Van Kempen : IP fragment support (borrowed from NET2E) 35 * Gerhard Koerting: Forward fragmented frames correctly. 36 * Gerhard Koerting: Fixes to my fix of the above 8-). 37 * Gerhard Koerting: IP interface addressing fix. 38 * Linus Torvalds : More robustness checks 39 * Alan Cox : Even more checks: Still not as robust as it ought to be 40 * Alan Cox : Save IP header pointer for later 41 * Alan Cox : ip option setting 42 * Alan Cox : Use ip_tos/ip_ttl settings 43 * Alan Cox : Fragmentation bogosity removed 44 * (Thanks to Mark.Bush@prg.ox.ac.uk) 45 * Dmitry Gorodchanin : Send of a raw packet crash fix. 
46 * Alan Cox : Silly ip bug when an overlength 47 * fragment turns up. Now frees the 48 * queue. 49 * Linus Torvalds/ : Memory leakage on fragmentation 50 * Alan Cox : handling. 51 * Gerhard Koerting: Forwarding uses IP priority hints 52 * Teemu Rantanen : Fragment problems. 53 * Alan Cox : General cleanup, comments and reformat 54 * Alan Cox : SNMP statistics 55 * Alan Cox : BSD address rule semantics. Also see 56 * UDP as there is a nasty checksum issue 57 * if you do things the wrong way. 58 * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file 59 * Alan Cox : IP options adjust sk->priority. 60 * Pedro Roque : Fix mtu/length error in ip_forward. 61 * Alan Cox : Avoid ip_chk_addr when possible. 62 * Richard Underwood : IP multicasting. 63 * Alan Cox : Cleaned up multicast handlers. 64 * Alan Cox : RAW sockets demultiplex in the BSD style. 65 * Gunther Mayer : Fix the SNMP reporting typo 66 * Alan Cox : Always in group 224.0.0.1 67 * Pauline Middelink : Fast ip_checksum update when forwarding 68 * Masquerading support. 69 * Alan Cox : Multicast loopback error for 224.0.0.1 70 * Alan Cox : IP_MULTICAST_LOOP option. 71 * Alan Cox : Use notifiers. 72 * Bjorn Ekwall : Removed ip_csum (from slhc.c too) 73 * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) 74 * Stefan Becker : Send out ICMP HOST REDIRECT 75 * Arnt Gulbrandsen : ip_build_xmit 76 * Alan Cox : Per socket routing cache 77 * Alan Cox : Fixed routing cache, added header cache. 78 * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. 79 * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. 80 * Alan Cox : Incoming IP option handling. 81 * Alan Cox : Set saddr on raw output frames as per BSD. 82 * Alan Cox : Stopped broadcast source route explosions. 83 * Alan Cox : Can disable source routing 84 * Takeshi Sone : Masquerading didn't work. 85 * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. 86 * Alan Cox : Memory leaks, tramples, misc debugging. 
87 * Alan Cox : Fixed multicast (by popular demand 8)) 88 * Alan Cox : Fixed forwarding (by even more popular demand 8)) 89 * Alan Cox : Fixed SNMP statistics [I think] 90 * Gerhard Koerting : IP fragmentation forwarding fix 91 * Alan Cox : Device lock against page fault. 92 * Alan Cox : IP_HDRINCL facility. 93 * Werner Almesberger : Zero fragment bug 94 * Alan Cox : RAW IP frame length bug 95 * Alan Cox : Outgoing firewall on build_xmit 96 * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel 97 * Alan Cox : Multicast routing hooks 98 * Jos Vos : Do accounting *before* call_in_firewall 99 * 100 * 101 * 102 * To Fix: 103 * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient 104 * and could be made very efficient with the addition of some virtual memory hacks to permit 105 * the allocation of a buffer that can then be 'grown' by twiddling page tables. 106 * Output fragmentation wants updating along with the buffer management to use a single 107 * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet 108 * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause 109 * fragmentation anyway. 110 * 111 * FIXME: copy frag 0 iph to qp->iph 112 * 113 * This program is free software; you can redistribute it and/or 114 * modify it under the terms of the GNU General Public License 115 * as published by the Free Software Foundation; either version 116 * 2 of the License, or (at your option) any later version. 117 */ 118
119 #include <asm/segment.h>
120 #include <asm/system.h>
121 #include <linux/types.h>
122 #include <linux/kernel.h>
123 #include <linux/sched.h>
124 #include <linux/mm.h>
125 #include <linux/string.h>
126 #include <linux/errno.h>
127 #include <linux/config.h>
128
129 #include <linux/socket.h>
130 #include <linux/sockios.h>
131 #include <linux/in.h>
132 #include <linux/inet.h>
133 #include <linux/netdevice.h>
134 #include <linux/etherdevice.h>
135 #include <linux/proc_fs.h>
136 #include <linux/stat.h>
137
138 #include <net/snmp.h>
139 #include <net/ip.h>
140 #include <net/protocol.h>
141 #include <net/route.h>
142 #include <net/tcp.h>
143 #include <net/udp.h>
144 #include <linux/skbuff.h>
145 #include <net/sock.h>
146 #include <net/arp.h>
147 #include <net/icmp.h>
148 #include <net/raw.h>
149 #include <net/checksum.h>
150 #include <linux/igmp.h>
151 #include <linux/ip_fw.h>
152 #ifdefCONFIG_IP_MASQUERADE 153 #include <net/ip_masq.h>
154 #endif 155 #include <linux/firewall.h>
156 #include <linux/mroute.h>
157 #include <net/netlink.h>
158 #ifdefCONFIG_NET_ALIAS 159 #include <linux/net_alias.h>
160 #endif 161
/*
 *	Symbols defined elsewhere in the networking code.
 */
extern int last_retran;
extern void sort_send(struct sock *sk);

/*
 *	Smaller of two values. This is a plain macro, so each argument may
 *	be evaluated more than once: do not pass expressions with side effects.
 */
#define min(a,b)	(((b) > (a)) ? (a) : (b))
166
167 /* 168 * SNMP management statistics 169 */ 170
171 #ifdefCONFIG_IP_FORWARD 172 structip_mibip_statistics={1,64,}; /* Forwarding=Yes, Default TTL=64 */ 173 #else 174 structip_mibip_statistics={2,64,}; /* Forwarding=No, Default TTL=64 */ 175 #endif 176
/*
 *	Handle the issuing of an ioctl() request
 *	for the ip device. This is scheduled to
 *	disappear.
 *
 *	No command is implemented at this level: whatever the value of
 *	cmd (sk and arg are ignored) the caller gets -EINVAL back.
 */

int ip_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	return -EINVAL;
}

192
193
194 /* 195 * This function receives all incoming IP datagrams. 196 * 197 * On entry skb->data points to the start of the IP header and 198 * the MAC header has been removed. 199 */ 200
201 intip_rcv(structsk_buff *skb, structdevice *dev, structpacket_type *pt)
/* */ 202 { 203 structiphdr *iph = skb->h.iph;
204 structsock *raw_sk=NULL;
205 unsignedcharhash;
206 unsignedcharflag = 0;
207 structinet_protocol *ipprot;
208 intbrd=IS_MYADDR;
209 structoptions * opt = NULL;
210 intis_frag=0;
211 __u32daddr;
212
213 #ifdefCONFIG_FIREWALL 214 interr;
215 #endif 216 #ifdefCONFIG_IP_MROUTE 217 intmroute_pkt=0;
218 #endif 219
220 #ifdef CONFIG_NET_IPV6
221 /* 222 * Intercept IPv6 frames. We dump ST-II and invalid types just below.. 223 */ 224
225 if(iph->version == 6)
226 return ipv6_rcv(skb,dev,pt);
227 #endif 228
229 ip_statistics.IpInReceives++;
230
231 /* 232 * Tag the ip header of this packet so we can find it 233 */ 234
235 skb->ip_hdr = iph;
236
237 /* 238 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. 239 * RFC1122: 3.1.2.3 MUST discard a frame with invalid source address [NEEDS FIXING]. 240 * 241 * Is the datagram acceptable? 242 * 243 * 1. Length at least the size of an ip header 244 * 2. Version of 4 245 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] 246 * 4. Doesn't have a bogus length 247 * (5. We ought to check for IP multicast addresses and undefined types.. does this matter ?) 248 */ 249
250 if (skb->len<sizeof(structiphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsignedchar *)iph, iph->ihl) !=0
251 || skb->len < ntohs(iph->tot_len))
252 { 253 ip_statistics.IpInHdrErrors++;
254 kfree_skb(skb, FREE_WRITE);
255 return(0);
256 } 257
258 /* 259 * Our transport medium may have padded the buffer out. Now we know it 260 * is IP we can trim to the true length of the frame. 261 * Note this now means skb->len holds ntohs(iph->tot_len). 262 */ 263
264 skb_trim(skb,ntohs(iph->tot_len));
265
266 /* 267 * Try to select closest <src,dst> alias device, if any. 268 * net_alias_dev_rcv_sel32 returns main device if it 269 * fails to found other. 270 */ 271
272 #ifdefCONFIG_NET_ALIAS 273 if (iph->daddr != skb->dev->pa_addr && net_alias_has(skb->dev))
274 skb->dev = dev = net_alias_dev_rcv_sel32(skb->dev, AF_INET, iph->saddr, iph->daddr);
275 #endif 276
277 if (iph->ihl > 5)
278 { 279 skb->ip_summed = 0;
280 if (ip_options_compile(NULL, skb))
281 return(0);
282 opt = (structoptions*)skb->proto_priv;
283 #ifdef CONFIG_IP_NOSR
284 if (opt->srr)
285 { 286 kfree_skb(skb, FREE_READ);
287 return -EINVAL;
288 } 289 #endif 290 } 291
292 /* 293 * Account for the packet (even if the packet is 294 * not accepted by the firewall!). 295 */ 296
297 #ifdefCONFIG_IP_ACCT 298 ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1);
299 #endif 300
301 /* 302 * See if the firewall wants to dispose of the packet. 303 */ 304
305 #ifdefCONFIG_FIREWALL 306
307 if ((err=call_in_firewall(PF_INET, skb->dev, iph))<FW_ACCEPT)
308 { 309 if(err==FW_REJECT)
310 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev);
311 kfree_skb(skb, FREE_WRITE);
312 return 0;
313 } 314
315 #endif 316
317 /* 318 * Remember if the frame is fragmented. 319 */ 320
321 if(iph->frag_off)
322 { 323 if (iph->frag_off & htons(IP_MF))
324 is_frag|=IPFWD_FRAGMENT;
325 /* 326 * Last fragment ? 327 */ 328
329 if (iph->frag_off & htons(IP_OFFSET))
330 is_frag|=IPFWD_LASTFRAG;
331 } 332
333 /* 334 * Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. 335 * 336 * This is inefficient. While finding out if it is for us we could also compute 337 * the routing table entry. This is where the great unified cache theory comes 338 * in as and when someone implements it 339 * 340 * For most hosts over 99% of packets match the first conditional 341 * and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at 342 * function entry. 343 */ 344 daddr = iph->daddr;
345 if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0)
346 { 347 if (opt && opt->srr)
348 { 349 intsrrspace, srrptr;
350 __u32nexthop;
351 unsignedchar * optptr = ((unsignedchar *)iph) + opt->srr;
352
353 if (brd != IS_MYADDR || skb->pkt_type != PACKET_HOST)
354 { 355 kfree_skb(skb, FREE_WRITE);
356 return 0;
357 } 358
359 for ( srrptr=optptr[2], srrspace = optptr[1];
360 srrptr <= srrspace;
361 srrptr += 4
362 )
363 { 364 intbrd2;
365 if (srrptr + 3 > srrspace)
366 { 367 icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2,
368 skb->dev);
369 kfree_skb(skb, FREE_WRITE);
370 return 0;
371 } 372 memcpy(&nexthop, &optptr[srrptr-1], 4);
373 if ((brd2 = ip_chk_addr(nexthop)) == 0)
374 break;
375 if (brd2 != IS_MYADDR)
376 { 377
378 /* 379 * ANK: should we implement weak tunneling of multicasts? 380 * Are they obsolete? DVMRP specs (RFC-1075) is old enough... 381 * [They are obsolete] 382 */ 383 kfree_skb(skb, FREE_WRITE);
384 return -EINVAL;
385 } 386 memcpy(&daddr, &optptr[srrptr-1], 4);
387 } 388 if (srrptr <= srrspace)
389 { 390 opt->srr_is_hit = 1;
391 opt->is_changed = 1;
392 #ifdefCONFIG_IP_FORWARD 393 if (ip_forward(skb, dev, is_frag, nexthop))
394 kfree_skb(skb, FREE_WRITE);
395 #else 396 ip_statistics.IpInAddrErrors++;
397 kfree_skb(skb, FREE_WRITE);
398 #endif 399 return 0;
400 } 401 } 402
403 #ifdefCONFIG_IP_MULTICAST 404 if(!(dev->flags&IFF_ALLMULTI) && brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK))
405 { 406 /* 407 * Check it is for one of our groups 408 */ 409 structip_mc_list *ip_mc=dev->ip_mc_list;
410 do 411 { 412 if(ip_mc==NULL)
413 { 414 kfree_skb(skb, FREE_WRITE);
415 return 0;
416 } 417 if(ip_mc->multiaddr==iph->daddr)
418 break;
419 ip_mc=ip_mc->next;
420 } 421 while(1);
422 } 423 #endif 424
425 #ifdefCONFIG_IP_MASQUERADE 426 /* 427 * Do we need to de-masquerade this fragment? 428 */ 429 if (ip_fw_demasquerade(&skb,dev))
430 { 431 structiphdr *iph=skb->h.iph;
432 if (ip_forward(skb, dev, is_frag|IPFWD_MASQUERADED, iph->daddr))
433 kfree_skb(skb, FREE_WRITE);
434 return(0);
435 } 436 #endif 437
438 /* 439 * Reassemble IP fragments. 440 */ 441
442 if(is_frag)
443 { 444 /* Defragment. Obtain the complete packet if there is one */ 445 skb=ip_defrag(iph,skb,dev);
446 if(skb==NULL)
447 return 0;
448 skb->dev = dev;
449 iph=skb->h.iph;
450 #ifdefCONFIG_IP_MASQUERADE 451 if (ip_fw_demasquerade(&skb,dev))
452 { 453 structiphdr *iph=skb->h.iph;
454 if (ip_forward(skb, dev, IPFWD_MASQUERADED, iph->daddr))
455 kfree_skb(skb, FREE_WRITE);
456 return 0;
457 } 458 #endif 459 } 460
461 /* 462 * Point into the IP datagram, just past the header. 463 */ 464
465 skb->ip_hdr = iph;
466 skb->h.raw += iph->ihl*4;
467
468 #ifdefCONFIG_IP_MROUTE 469 /* 470 * Check the state on multicast routing (multicast and not 224.0.0.z) 471 */ 472
473 if(brd==IS_MULTICAST && (iph->daddr&htonl(0xFFFFFF00))!=htonl(0xE0000000))
474 mroute_pkt=1;
475
476 #endif 477 /* 478 * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. 479 * 480 * RFC 1122: SHOULD pass TOS value up to the transport layer. 481 */ 482
483 hash = iph->protocol & (SOCK_ARRAY_SIZE-1);
484
485 /* 486 * If there maybe a raw socket we must check - if not we don't care less 487 */ 488
489 if((raw_sk=raw_prot.sock_array[hash])!=NULL)
490 { 491 structsock *sknext=NULL;
492 structsk_buff *skb1;
493 raw_sk=get_sock_raw(raw_sk, iph->protocol, iph->saddr, iph->daddr);
494 if(raw_sk) /* Any raw sockets */ 495 { 496 do 497 { 498 /* Find the next */ 499 sknext=get_sock_raw(raw_sk->next, iph->protocol, iph->saddr, iph->daddr);
500 if(sknext)
501 skb1=skb_clone(skb, GFP_ATOMIC);
502 else 503 break; /* One pending raw socket left */ 504 if(skb1)
505 raw_rcv(raw_sk, skb1, dev, iph->saddr,daddr);
506 raw_sk=sknext;
507 } 508 while(raw_sk!=NULL);
509
510 /* 511 * Here either raw_sk is the last raw socket, or NULL if none 512 */ 513
514 /* 515 * We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy 516 */ 517 } 518 } 519
520 /* 521 * skb->h.raw now points at the protocol beyond the IP header. 522 */ 523
524 hash = iph->protocol & (MAX_INET_PROTOS -1);
525 for (ipprot = (structinet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(structinet_protocol *)ipprot->next)
526 { 527 structsk_buff *skb2;
528
529 if (ipprot->protocol != iph->protocol)
530 continue;
531 /* 532 * See if we need to make a copy of it. This will 533 * only be set if more than one protocol wants it. 534 * and then not for the last one. If there is a pending 535 * raw delivery wait for that 536 */ 537
538 #ifdefCONFIG_IP_MROUTE 539 if (ipprot->copy || raw_sk || mroute_pkt)
540 #else 541 if (ipprot->copy || raw_sk)
542 #endif 543 { 544 skb2 = skb_clone(skb, GFP_ATOMIC);
545 if(skb2==NULL)
546 continue;
547 } 548 else 549 { 550 skb2 = skb;
551 } 552 flag = 1;
553
554 /* 555 * Pass on the datagram to each protocol that wants it, 556 * based on the datagram protocol. We should really 557 * check the protocol handler's return values here... 558 */ 559
560 ipprot->handler(skb2, dev, opt, daddr,
561 (ntohs(iph->tot_len) - (iph->ihl * 4)),
562 iph->saddr, 0, ipprot);
563 } 564
565 /* 566 * All protocols checked. 567 * If this packet was a broadcast, we may *not* reply to it, since that 568 * causes (proven, grin) ARP storms and a leakage of memory (i.e. all 569 * ICMP reply messages get queued up for transmission...) 570 */ 571
572 #ifdefCONFIG_IP_MROUTE 573 /* 574 * Forward the last copy to the multicast router. If 575 * there is a pending raw delivery however make a copy 576 * and forward that. 577 */ 578
579 if(mroute_pkt)
580 { 581 flag=1;
582 if(raw_sk==NULL)
583 ipmr_forward(skb, is_frag);
584 else 585 { 586 structsk_buff *skb2=skb_clone(skb, GFP_ATOMIC);
587 if(skb2)
588 { 589 skb2->free=1;
590 ipmr_forward(skb2, is_frag);
591 } 592 } 593 } 594 #endif 595
596 if(raw_sk!=NULL) /* Shift to last raw user */ 597 raw_rcv(raw_sk, skb, dev, iph->saddr, daddr);
598 elseif (!flag) /* Free and report errors */ 599 { 600 if (brd != IS_BROADCAST && brd!=IS_MULTICAST)
601 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev);
602 kfree_skb(skb, FREE_WRITE);
603 } 604
605 return(0);
606 } 607
608 /* 609 * Do any unicast IP forwarding required. 610 */ 611
612 /* 613 * Don't forward multicast or broadcast frames. 614 */ 615
616 if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST)
617 { 618 kfree_skb(skb,FREE_WRITE);
619 return 0;
620 } 621
622 /* 623 * The packet is for another target. Forward the frame 624 */ 625
626 #ifdefCONFIG_IP_FORWARD 627 if (opt && opt->is_strictroute)
628 { 629 icmp_send(skb, ICMP_PARAMETERPROB, 0, 16, skb->dev);
630 kfree_skb(skb, FREE_WRITE);
631 return -1;
632 } 633 if (ip_forward(skb, dev, is_frag, iph->daddr))
634 kfree_skb(skb, FREE_WRITE);
635 #else 636 /* printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n", 637 iph->saddr,iph->daddr);*/ 638 ip_statistics.IpInAddrErrors++;
639 kfree_skb(skb, FREE_WRITE);
640 #endif 641 return(0);
642 } 643
644