1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: @(#)route.c 1.0.14 05/31/93
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 *
39 * This program is free software; you can redistribute it and/or
40 * modify it under the terms of the GNU General Public License
41 * as published by the Free Software Foundation; either version
42 * 2 of the License, or (at your option) any later version.
43 */
44
45 #include <asm/segment.h>
46 #include <asm/system.h>
47 #include <linux/types.h>
48 #include <linux/kernel.h>
49 #include <linux/sched.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/socket.h>
53 #include <linux/sockios.h>
54 #include <linux/errno.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <net/ip.h>
59 #include <net/protocol.h>
60 #include <net/route.h>
61 #include <net/tcp.h>
62 #include <linux/skbuff.h>
63 #include <net/sock.h>
64 #include <net/icmp.h>
65 #include <net/netlink.h>
66
67 /*
68 * The routing table list
69 */
70
71 static struct rtable *rt_base = NULL;
72 unsigned long rt_stamp = 1; /* Routing table version stamp for caches ( 0 is 'unset' ) */
73
74 /*
75 * Pointer to the loopback route
76 */
77
78 static struct rtable *rt_loopback = NULL;
79
80 /*
81 * Remove a routing table entry.
82 */
83
84 static int rt_del(__u32 dst, __u32 mask,
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
85 char *devname, __u32 gtw, short rt_flags, short metric)
86 {
87 struct rtable *r, **rp;
88 unsigned long flags;
89 int found=0;
90
91 rp = &rt_base;
92
93 /*
94 * This must be done with interrupts off because we could take
95 * an ICMP_REDIRECT.
96 */
97
98 save_flags(flags);
99 cli();
100 while((r = *rp) != NULL)
101 {
102 /*
103 * Make sure the destination and netmask match.
104 * metric, gateway and device are also checked
105 * if they were specified.
106 */
107 if (r->rt_dst != dst ||
108 (mask && r->rt_mask != mask) ||
109 (gtw && r->rt_gateway != gtw) ||
110 (metric >= 0 && r->rt_metric != metric) ||
111 (devname && strcmp((r->rt_dev)->name,devname) != 0) )
112 {
113 rp = &r->rt_next;
114 continue;
115 }
116 *rp = r->rt_next;
117
118 /*
119 * If we delete the loopback route update its pointer.
120 */
121
122 if (rt_loopback == r)
123 rt_loopback = NULL;
124 ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, rt_flags, metric, r->rt_dev->name);
125 kfree_s(r, sizeof(struct rtable));
126 found=1;
127 }
128 rt_stamp++; /* New table revision */
129
130 restore_flags(flags);
131
132 if(found)
133 return 0;
134 return -ESRCH;
135 }
136
137
138 /*
139 * Remove all routing table entries for a device. This is called when
140 * a device is downed.
141 */
142
143 void ip_rt_flush(struct device *dev)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
144 {
145 struct rtable *r;
146 struct rtable **rp;
147 unsigned long flags;
148
149 rp = &rt_base;
150 save_flags(flags);
151 cli();
152 while ((r = *rp) != NULL) {
153 if (r->rt_dev != dev) {
154 rp = &r->rt_next;
155 continue;
156 }
157 *rp = r->rt_next;
158 if (rt_loopback == r)
159 rt_loopback = NULL;
160 kfree_s(r, sizeof(struct rtable));
161 }
162 rt_stamp++; /* New table revision */
163 restore_flags(flags);
164 }
165
/*
 *	Used by guess_mask() when we can't get the netmask any other way.
 *
 *	Returns the classful netmask for 'dst': the class A mask for a
 *	class A address, the class B mask for a class B address, and the
 *	class C mask for everything else. Both the argument and the result
 *	are in network byte order.
 */

static __u32 default_mask(__u32 dst)
{
	/* The IN_CLASSx() tests operate on host byte order. */
	dst = ntohl(dst);
	if (IN_CLASSA(dst))
		return htonl(IN_CLASSA_NET);
	if (IN_CLASSB(dst))
		return htonl(IN_CLASSB_NET);
	return htonl(IN_CLASSC_NET);
}
183
184
185 /*
186 * If no mask is specified then generate a default entry.
187 */
188
189 static __u32 guess_mask(__u32 dst, struct device * dev)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
190 {
191 __u32 mask;
192
193 if (!dst)
194 return 0;
195 mask = default_mask(dst);
196 if ((dst ^ dev->pa_addr) & mask)
197 return mask;
198 return dev->pa_mask;
199 }
200
201
202 /*
203 * Find the route entry through which our gateway will be reached
204 */
205
206 static inline struct device * get_gw_dev(__u32 gw)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
207 {
208 struct rtable * rt;
209
210 for (rt = rt_base ; ; rt = rt->rt_next)
211 {
212 if (!rt)
213 return NULL;
214 if ((gw ^ rt->rt_dst) & rt->rt_mask)
215 continue;
216 /*
217 * Gateways behind gateways are a no-no
218 */
219
220 if (rt->rt_flags & RTF_GATEWAY)
221 return NULL;
222 return rt->rt_dev;
223 }
224 }
225
226 /*
227 * Rewrote rt_add(), as the old one was weird - Linus
228 *
229 * This routine is used to update the IP routing table, either
230 * from the kernel (ICMP_REDIRECT) or via an ioctl call issued
231 * by the superuser.
232 */
233
234 void ip_rt_add(short flags, __u32 dst, __u32 mask,
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
235 __u32 gw, struct device *dev, unsigned short mtu,
236 unsigned long window, unsigned short irtt, short metric)
237 {
238 struct rtable *r, *rt;
239 struct rtable **rp;
240 unsigned long cpuflags;
241 int duplicate = 0;
242
243 /*
244 * A host is a unique machine and has no network bits.
245 */
246
247 if (flags & RTF_HOST)
248 {
249 mask = 0xffffffff;
250 }
251
252 /*
253 * Calculate the network mask
254 */
255
256 else if (!mask)
257 {
258 if (!((dst ^ dev->pa_addr) & dev->pa_mask))
259 {
260 mask = dev->pa_mask;
261 flags &= ~RTF_GATEWAY;
262 if (flags & RTF_DYNAMIC)
263 {
264 /*printk("Dynamic route to my own net rejected\n");*/
265 return;
266 }
267 }
268 else
269 mask = guess_mask(dst, dev);
270 dst &= mask;
271 }
272
273 /*
274 * A gateway must be reachable and not a local address
275 */
276
277 if (gw == dev->pa_addr)
278 flags &= ~RTF_GATEWAY;
279
280 if (flags & RTF_GATEWAY)
281 {
282 /*
283 * Don't try to add a gateway we can't reach..
284 */
285
286 if (dev != get_gw_dev(gw))
287 return;
288
289 flags |= RTF_GATEWAY;
290 }
291 else
292 gw = 0;
293
294 /*
295 * Allocate an entry and fill it in.
296 */
297
298 rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
299 if (rt == NULL)
300 {
301 return;
302 }
303 memset(rt, 0, sizeof(struct rtable));
304 rt->rt_flags = flags | RTF_UP;
305 rt->rt_dst = dst;
306 rt->rt_dev = dev;
307 rt->rt_gateway = gw;
308 rt->rt_mask = mask;
309 rt->rt_mss = dev->mtu - HEADER_SIZE;
310 rt->rt_metric = metric;
311 rt->rt_window = 0; /* Default is no clamping */
312
313 /* Are the MSS/Window valid ? */
314
315 if(rt->rt_flags & RTF_MSS)
316 rt->rt_mss = mtu;
317
318 if(rt->rt_flags & RTF_WINDOW)
319 rt->rt_window = window;
320 if(rt->rt_flags & RTF_IRTT)
321 rt->rt_irtt = irtt;
322
323 /*
324 * What we have to do is loop though this until we have
325 * found the first address which has a higher generality than
326 * the one in rt. Then we can put rt in right before it.
327 * The interrupts must be off for this process.
328 */
329
330 save_flags(cpuflags);
331 cli();
332
333 /*
334 * Remove old route if we are getting a duplicate.
335 */
336
337 rp = &rt_base;
338 while ((r = *rp) != NULL)
339 {
340 if (r->rt_dst != dst ||
341 r->rt_mask != mask)
342 {
343 rp = &r->rt_next;
344 continue;
345 }
346 if (r->rt_metric != metric && r->rt_gateway != gw)
347 {
348 duplicate = 1;
349 rp = &r->rt_next;
350 continue;
351 }
352 *rp = r->rt_next;
353 if (rt_loopback == r)
354 rt_loopback = NULL;
355 ip_netlink_msg(RTMSG_DELROUTE, dst,gw, mask, flags, metric, rt->rt_dev->name);
356 kfree_s(r, sizeof(struct rtable));
357 }
358
359 /*
360 * Add the new route
361 */
362
363 rp = &rt_base;
364 while ((r = *rp) != NULL) {
365 /*
366 * When adding a duplicate route, add it before
367 * the route with a higher metric.
368 */
369 if (duplicate &&
370 r->rt_dst == dst &&
371 r->rt_mask == mask &&
372 r->rt_metric > metric)
373 break;
374 else
375 /*
376 * Otherwise, just add it before the
377 * route with a higher generality.
378 */
379 if ((r->rt_mask & mask) != mask)
380 break;
381 rp = &r->rt_next;
382 }
383 rt->rt_next = r;
384 *rp = rt;
385
386 /*
387 * Update the loopback route
388 */
389
390 if ((rt->rt_dev->flags & IFF_LOOPBACK) && !rt_loopback)
391 rt_loopback = rt;
392
393 rt_stamp++; /* New table revision */
394
395 /*
396 * Restore the interrupts and return
397 */
398
399 restore_flags(cpuflags);
400 ip_netlink_msg(RTMSG_NEWROUTE, dst,gw, mask, flags, metric, rt->rt_dev->name);
401 return;
402 }
403
404
/*
 *	Check if a mask is acceptable.
 *
 *	Returns 1 when 'addr' has bits set outside 'mask', or when the
 *	host part of the mask (its complement, viewed in host byte order)
 *	is not one contiguous run of low-order one bits. Returns 0 for an
 *	acceptable mask.
 */

static inline int bad_mask(__u32 mask, __u32 addr)
{
	__u32 hostbits = ~mask;

	if (addr & hostbits)
		return 1;

	/*
	 *	A contiguous low-order run of ones plus one shares no bit
	 *	with the run itself.
	 */
	hostbits = ntohl(hostbits);
	return (hostbits & (hostbits + 1)) != 0;
}
418
419 /*
420 * Process a route add request from the user
421 */
422
423 static int rt_new(struct rtentry *r)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
424 {
425 int err;
426 char * devname;
427 struct device * dev = NULL;
428 unsigned long flags;
429 __u32 daddr, mask, gw;
430 short metric;
431
432 /*
433 * If a device is specified find it.
434 */
435
436 if ((devname = r->rt_dev) != NULL)
437 {
438 err = getname(devname, &devname);
439 if (err)
440 return err;
441 dev = dev_get(devname);
442 putname(devname);
443 if (!dev)
444 return -ENODEV;
445 }
446
447 /*
448 * If the device isn't INET, don't allow it
449 */
450
451 if (r->rt_dst.sa_family != AF_INET)
452 return -EAFNOSUPPORT;
453
454 /*
455 * Make local copies of the important bits
456 * We decrement the metric by one for BSD compatibility.
457 */
458
459 flags = r->rt_flags;
460 daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr;
461 mask = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr;
462 gw = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr;
463 metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0;
464
465 /*
466 * BSD emulation: Permits route add someroute gw one-of-my-addresses
467 * to indicate which iface. Not as clean as the nice Linux dev technique
468 * but people keep using it...
469 */
470
471 if (!dev && (flags & RTF_GATEWAY))
472 {
473 struct device *dev2;
474 for (dev2 = dev_base ; dev2 != NULL ; dev2 = dev2->next)
475 {
476 if ((dev2->flags & IFF_UP) && dev2->pa_addr == gw)
477 {
478 flags &= ~RTF_GATEWAY;
479 dev = dev2;
480 break;
481 }
482 }
483 }
484
485 /*
486 * Ignore faulty masks
487 */
488
489 if (bad_mask(mask, daddr))
490 mask=0;
491
492 /*
493 * Set the mask to nothing for host routes.
494 */
495
496 if (flags & RTF_HOST)
497 mask = 0xffffffff;
498 else if (mask && r->rt_genmask.sa_family != AF_INET)
499 return -EAFNOSUPPORT;
500
501 /*
502 * You can only gateway IP via IP..
503 */
504
505 if (flags & RTF_GATEWAY)
506 {
507 if (r->rt_gateway.sa_family != AF_INET)
508 return -EAFNOSUPPORT;
509 if (!dev)
510 dev = get_gw_dev(gw);
511 }
512 else if (!dev)
513 dev = ip_dev_check(daddr);
514
515 /*
516 * Unknown device.
517 */
518
519 if (dev == NULL)
520 return -ENETUNREACH;
521
522 /*
523 * Add the route
524 */
525
526 ip_rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
527 return 0;
528 }
529
530
531 /*
532 * Remove a route, as requested by the user.
533 */
534
535 static int rt_kill(struct rtentry *r)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
536 {
537 struct sockaddr_in *trg;
538 struct sockaddr_in *msk;
539 struct sockaddr_in *gtw;
540 char *devname;
541 int err;
542
543 trg = (struct sockaddr_in *) &r->rt_dst;
544 msk = (struct sockaddr_in *) &r->rt_genmask;
545 gtw = (struct sockaddr_in *) &r->rt_gateway;
546 if ((devname = r->rt_dev) != NULL)
547 {
548 err = getname(devname, &devname);
549 if (err)
550 return err;
551 }
552 /*
553 * metric can become negative here if it wasn't filled in
554 * but that's a fortunate accident; we really use that in rt_del.
555 */
556 err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, devname,
557 (__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
558 if ( devname != NULL )
559 putname(devname);
560 return err;
561 }
562
563
564 /*
565 * Called from the PROCfs module. This outputs /proc/net/route.
566 *
567 * We preserve the old format but pad the buffers out. This means that
568 * we can spin over the other entries as we read them. Remember the
569 * gated BGP4 code could need to read 60,000+ routes on occasion (thats
570 * about 7Mb of data). To do that ok we will need to also cache the
571 * last route we got to (reads will generally be following on from
572 * one another without gaps).
573 */
574
575 int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
576 {
577 struct rtable *r;
578 int len=0;
579 off_t pos=0;
580 off_t begin=0;
581 char temp[129];
582
583 if(offset<128)
584 {
585 sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
586 pos=128;
587 }
588
589 for (r = rt_base; r != NULL; r = r->rt_next)
590 {
591 /*
592 * Spin through entries until we are ready
593 */
594 if(pos+128<offset)
595 {
596 pos+=128;
597 continue;
598 }
599
600 sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
601 r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
602 r->rt_flags, r->rt_refcnt, r->rt_use, r->rt_metric,
603 (unsigned long)r->rt_mask, (int)r->rt_mss, r->rt_window, (int)r->rt_irtt);
604 sprintf(buffer+len,"%-127s\n",temp);
605 len+=128;
606 pos+=128;
607 if(pos<offset)
608 {
609 len=0;
610 begin=pos;
611 }
612 if(pos>offset+length)
613 break;
614 }
615
616 *start=buffer+(offset-begin);
617 len-=(offset-begin);
618 if(len>length)
619 len=length;
620 return len;
621 }
622
623 /*
624 * This is hackish, but results in better code. Use "-S" to see why.
625 */
626
627 #define early_out ({ goto no_route; 1; })
628
629 /*
630 * Route a packet. This needs to be fairly quick. Florian & Co.
631 * suggested a unified ARP and IP routing cache. Done right its
632 * probably a brilliant idea. I'd actually suggest a unified
633 * ARP/IP routing/Socket pointer cache. Volunteers welcome
634 */
635
636 struct rtable * ip_rt_route(__u32 daddr, struct options *opt, __u32 *src_addr)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
637 {
638 struct rtable *rt;
639
640 for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next)
641 {
642 if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
643 break;
644 /*
645 * broadcast addresses can be special cases..
646 */
647 if (rt->rt_flags & RTF_GATEWAY)
648 continue;
649 if ((rt->rt_dev->flags & IFF_BROADCAST) &&
650 (rt->rt_dev->pa_brdaddr == daddr))
651 break;
652 }
653
654 if(rt->rt_flags&RTF_REJECT)
655 return NULL;
656
657 if(src_addr!=NULL)
658 *src_addr= rt->rt_dev->pa_addr;
659
660 if (daddr == rt->rt_dev->pa_addr) {
661 if ((rt = rt_loopback) == NULL)
662 goto no_route;
663 }
664 rt->rt_use++;
665 return rt;
666 no_route:
667 return NULL;
668 }
669
670 struct rtable * ip_rt_local(__u32 daddr, struct options *opt, __u32 *src_addr)
/* ![[previous]](../icons/left.png)
![[next]](../icons/right.png)
![[first]](../icons/first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
671 {
672 struct rtable *rt;
673
674 for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next)
675 {
676 /*
677 * No routed addressing.
678 */
679 if (rt->rt_flags&RTF_GATEWAY)
680 continue;
681
682 if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
683 break;
684 /*
685 * broadcast addresses can be special cases..
686 */
687
688 if ((rt->rt_dev->flags & IFF_BROADCAST) &&
689 rt->rt_dev->pa_brdaddr == daddr)
690 break;
691 }
692
693 if(src_addr!=NULL)
694 *src_addr= rt->rt_dev->pa_addr;
695
696 if (daddr == rt->rt_dev->pa_addr) {
697 if ((rt = rt_loopback) == NULL)
698 goto no_route;
699 }
700 rt->rt_use++;
701 return rt;
702 no_route:
703 return NULL;
704 }
705
706 /*
707 * Handle IP routing ioctl calls. These are used to manipulate the routing tables
708 */
709
710 int ip_rt_ioctl(unsigned int cmd, void *arg)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
711 {
712 int err;
713 struct rtentry rt;
714
715 switch(cmd)
716 {
717 case SIOCADDRT: /* Add a route */
718 case SIOCDELRT: /* Delete a route */
719 if (!suser())
720 return -EPERM;
721 err=verify_area(VERIFY_READ, arg, sizeof(struct rtentry));
722 if (err)
723 return err;
724 memcpy_fromfs(&rt, arg, sizeof(struct rtentry));
725 return (cmd == SIOCDELRT) ? rt_kill(&rt) : rt_new(&rt);
726 }
727
728 return -EINVAL;
729 }