/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_check_expire(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
static struct timer_list rt_secret_timer;

/*
 *      Interface to the generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
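
/*
 * A minimal reader-side sketch of the scheme above (the same pattern
 * ip_rt_frag_needed() and ip_rt_redirect() use later in this file):
 *
 *      rcu_read_lock();
 *      for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *           rth = rcu_dereference(rth->u.dst.rt_next)) {
 *              ...
 *      }
 *      rcu_read_unlock();
 *
 * Writers instead take the per-bucket lock, rt_hash_lock_addr(hash),
 * before unlinking entries, as rt_check_expire() and rt_del() do below.
 */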

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.  (On lockdep we have a quite big spinlock_t, so keep the
 * size down there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static unsigned int             rt_hash_log;
static unsigned int             rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
        return (jhash_2words(daddr, saddr, rt_hash_rnd)
                & rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
        rt_hash_code((__force u32)(__be32)(daddr),\
                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
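
/*
 * Illustrative use, matching the callers later in this file: hash the
 * flow by destination, source and interface index, then walk the bucket:
 *
 *      unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
 *      struct rtable *rth = rt_hash_table[hash].chain;
 */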

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.dst.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &rt_cache_seq_ops,
                        sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

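/*
 * Both helpers below defer the actual freeing to an RCU (bottom-half)
 * grace period via call_rcu_bh(), so lock-free readers traversing a
 * chain under rcu_read_lock_bh() never see an entry reused from under
 * them; rt_drop() additionally releases the caller's reference first.
 */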
static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
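
/*
 * Worked example (a sketch): for an unreferenced entry last used one
 * jiffy ago, score = ~(u32)1 & ~(3<<30) = 0x3ffffffe, nearly the
 * maximum usage counter; as the age grows the counter shrinks, so
 * colder entries score lower and are evicted first in rt_intern_hash().
 */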

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
                (fl1->mark ^ fl2->mark) |
                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
}

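/*
 * A sketch of the scan budget below, assuming the default sysctls above:
 * with ip_rt_gc_interval = 60*HZ and ip_rt_gc_timeout = 300*HZ,
 * goal works out to (buckets * 60) / 300, i.e. roughly one fifth of the
 * hash table is scanned on each run of the expiry work.
 */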
static void rt_check_expire(struct work_struct *work)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == NULL)
                        continue;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
        }
        rover = i;
        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep plenty of warm entries, and when
   load increases it shrinks to limit the cache size.
 */

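/*
 * Illustrative numbers (a sketch, using the defaults above): with
 * ip_rt_gc_elasticity = 8 and, say, a 2^16-bucket hash table, the first
 * goal computation below only turns positive once the cache exceeds
 * 8 << 16 = 524288 entries, i.e. an average chain length of eight.
 */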
static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * so do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire was reduced to zero (otherwise it is halved),
                   - the table is not full,
                   - we are called from interrupt context.
                   The jiffies check is just a fallback/debug loop breaker;
                   we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        dst_use(&rth->u.dst, now);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to ARP only if it is an output
           route or on the unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to a destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.dst.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.dst.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                                rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

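/*
 * Worked example (a sketch, with the defaults above): given
 * ip_rt_redirect_number = 9 and ip_rt_redirect_load = HZ/50, the k-th
 * redirect is sent no earlier than rate_last + ((HZ/50) << k); after
 * nine unanswered redirects we stay silent until ip_rt_redirect_silence
 * = (HZ/50) << 10 has elapsed, then the token counter resets.
 */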
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything;
         * set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (rt->u.dst.rate_tokens == 0 ||
            time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
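
/*
 * Illustrative behaviour (sketch): guess_mtu(1500) returns 1492, the
 * first plateau strictly below the old MTU; guess_mtu(576) returns 296;
 * and anything at or below the smallest plateau (128) falls back to 68,
 * the minimum IPv4 MTU.
 */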

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        __be32  skeys[2] = { iph->saddr, 0, };
        __be32  daddr = iph->daddr;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash(daddr, skeys[i], 0);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.dst.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
        }
}
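
/*
 * Sketch of ip_rt_update_pmtu() with the defaults above: a PMTU report
 * of, say, 300 bytes is clamped to ip_rt_min_pmtu = 512 + 20 + 20 = 552,
 * the RTAX_MTU metric is locked against further reduction, and the
 * learned value expires after ip_rt_mtu_expires (ten minutes).
 */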

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
                struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
                if (loopback_idev) {
                        rt->idev = loopback_idev;
                        in_dev_put(idev);
                }
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}
1416
1417 /*
1418    We do not cache source address of outgoing interface,
1419    because it is used only by IP RR, TS and SRR options,
1420    so that it out of fast path.
1421
1422    BTW remember: "addr" is allowed to be not aligned
1423    in IP options!
1424  */
1425
1426 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1427 {
1428         __be32 src;
1429         struct fib_result res;
1430
1431         if (rt->fl.iif == 0)
1432                 src = rt->rt_src;
1433         else if (fib_lookup(&rt->fl, &res) == 0) {
1434                 src = FIB_RES_PREFSRC(res);
1435                 fib_res_put(&res);
1436         } else
1437                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1438                                         RT_SCOPE_UNIVERSE);
1439         memcpy(addr, &src, 4);
1440 }
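
/*
 * A minimal usage sketch in the spirit of the IP option handlers; the
 * helper name is illustrative.  "optptr" points into the options area of
 * an IP header and therefore need not be 4-byte aligned, which is why
 * ip_rt_get_source() above ends with memcpy() rather than a __be32 store.
 */
static inline void example_fill_option_addr(u8 *optptr, struct rtable *rt)
{
	ip_rt_get_source(optptr, rt);	/* alignment-safe 4-byte copy */
}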
1441
1442 #ifdef CONFIG_NET_CLS_ROUTE
1443 static void set_class_tag(struct rtable *rt, u32 tag)
1444 {
1445         if (!(rt->u.dst.tclassid & 0xFFFF))
1446                 rt->u.dst.tclassid |= tag & 0xFFFF;
1447         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1448                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1449 }
1450 #endif
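
/*
 * Worked example for set_class_tag(): tclassid packs two 16-bit realm
 * halves, and each half is written only while it is still zero.  Starting
 * from tclassid == 0x00050000, set_class_tag(rt, 0x00030002) leaves the
 * non-zero high half alone and fills the low half, giving 0x00050002.
 */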
1451
1452 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1453 {
1454         struct fib_info *fi = res->fi;
1455
1456         if (fi) {
1457                 if (FIB_RES_GW(*res) &&
1458                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1459                         rt->rt_gateway = FIB_RES_GW(*res);
1460                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1461                        sizeof(rt->u.dst.metrics));
1462                 if (fi->fib_mtu == 0) {
1463                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1464                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1465                             rt->rt_gateway != rt->rt_dst &&
1466                             rt->u.dst.dev->mtu > 576)
1467                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1468                 }
1469 #ifdef CONFIG_NET_CLS_ROUTE
1470                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1471 #endif
1472         } else
1473                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1474
1475         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1476                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1477         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1478                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1479         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1480                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1481                                        ip_rt_min_advmss);
1482         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1483                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1484
1485 #ifdef CONFIG_NET_CLS_ROUTE
1486 #ifdef CONFIG_IP_MULTIPLE_TABLES
1487         set_class_tag(rt, fib_rules_tclass(res));
1488 #endif
1489         set_class_tag(rt, itag);
1490 #endif
1491         rt->rt_type = res->type;
1492 }
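
/*
 * Worked example for the metric defaults above: a route over a 1500-byte
 * MTU device with no fib metrics gets RTAX_MTU = 1500 and RTAX_ADVMSS =
 * max(1500 - 40, ip_rt_min_advmss) = 1460 (assuming the usual
 * ip_rt_min_advmss default of 256), while RTAX_HOPLIMIT falls back to
 * sysctl_ip_default_ttl.
 */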
1493
1494 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1495                                 u8 tos, struct net_device *dev, int our)
1496 {
1497         unsigned hash;
1498         struct rtable *rth;
1499         __be32 spec_dst;
1500         struct in_device *in_dev = in_dev_get(dev);
1501         u32 itag = 0;
1502
1503         /* Primary sanity checks. */
1504
1505         if (in_dev == NULL)
1506                 return -EINVAL;
1507
1508         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1509             skb->protocol != htons(ETH_P_IP))
1510                 goto e_inval;
1511
1512         if (ZERONET(saddr)) {
1513                 if (!LOCAL_MCAST(daddr))
1514                         goto e_inval;
1515                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1516         } else if (fib_validate_source(saddr, 0, tos, 0,
1517                                         dev, &spec_dst, &itag) < 0)
1518                 goto e_inval;
1519
1520         rth = dst_alloc(&ipv4_dst_ops);
1521         if (!rth)
1522                 goto e_nobufs;
1523
1524         rth->u.dst.output= ip_rt_bug;
1525
1526         atomic_set(&rth->u.dst.__refcnt, 1);
1527         rth->u.dst.flags= DST_HOST;
1528         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1529                 rth->u.dst.flags |= DST_NOPOLICY;
1530         rth->fl.fl4_dst = daddr;
1531         rth->rt_dst     = daddr;
1532         rth->fl.fl4_tos = tos;
1533         rth->fl.mark    = skb->mark;
1534         rth->fl.fl4_src = saddr;
1535         rth->rt_src     = saddr;
1536 #ifdef CONFIG_NET_CLS_ROUTE
1537         rth->u.dst.tclassid = itag;
1538 #endif
1539         rth->rt_iif     =
1540         rth->fl.iif     = dev->ifindex;
1541         rth->u.dst.dev  = init_net.loopback_dev;
1542         dev_hold(rth->u.dst.dev);
1543         rth->idev       = in_dev_get(rth->u.dst.dev);
1544         rth->fl.oif     = 0;
1545         rth->rt_gateway = daddr;
1546         rth->rt_spec_dst= spec_dst;
1547         rth->rt_type    = RTN_MULTICAST;
1548         rth->rt_flags   = RTCF_MULTICAST;
1549         if (our) {
1550                 rth->u.dst.input= ip_local_deliver;
1551                 rth->rt_flags |= RTCF_LOCAL;
1552         }
1553
1554 #ifdef CONFIG_IP_MROUTE
1555         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1556                 rth->u.dst.input = ip_mr_input;
1557 #endif
1558         RT_CACHE_STAT_INC(in_slow_mc);
1559
1560         in_dev_put(in_dev);
1561         hash = rt_hash(daddr, saddr, dev->ifindex);
1562         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1563
1564 e_nobufs:
1565         in_dev_put(in_dev);
1566         return -ENOBUFS;
1567
1568 e_inval:
1569         in_dev_put(in_dev);
1570         return -EINVAL;
1571 }
1572
1573
1574 static void ip_handle_martian_source(struct net_device *dev,
1575                                      struct in_device *in_dev,
1576                                      struct sk_buff *skb,
1577                                      __be32 daddr,
1578                                      __be32 saddr)
1579 {
1580         RT_CACHE_STAT_INC(in_martian_src);
1581 #ifdef CONFIG_IP_ROUTE_VERBOSE
1582         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1583                 /*
1584                  *      RFC 1812 recommendation: if the source is martian,
1585                  *      the only hint is the MAC header.
1586                  */
1587                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1588                         "%u.%u.%u.%u, on dev %s\n",
1589                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1590                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1591                         int i;
1592                         const unsigned char *p = skb_mac_header(skb);
1593                         printk(KERN_WARNING "ll header: ");
1594                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1595                                 printk("%02x", *p);
1596                                 if (i < (dev->hard_header_len - 1))
1597                                         printk(":");
1598                         }
1599                         printk("\n");
1600                 }
1601         }
1602 #endif
1603 }
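
/*
 * With log_martians enabled, the code above emits console lines of this
 * shape (addresses and device are made-up sample values; note that the
 * first address printed is the packet's destination and the second the
 * offending source):
 *
 *   martian source 192.168.1.255 from 10.0.0.1, on dev eth0
 *   ll header: ff:ff:ff:ff:ff:ff:00:11:22:33:44:55:08:00
 */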
1604
1605 static inline int __mkroute_input(struct sk_buff *skb,
1606                                   struct fib_result* res,
1607                                   struct in_device *in_dev,
1608                                   __be32 daddr, __be32 saddr, u32 tos,
1609                                   struct rtable **result)
1610 {
1611
1612         struct rtable *rth;
1613         int err;
1614         struct in_device *out_dev;
1615         unsigned flags = 0;
1616         __be32 spec_dst;
1617         u32 itag;
1618
1619         /* get a working reference to the output device */
1620         out_dev = in_dev_get(FIB_RES_DEV(*res));
1621         if (out_dev == NULL) {
1622                 if (net_ratelimit())
1623                         printk(KERN_CRIT "Bug in ip_route_input"
1624                                "_slow(). Please report.\n");
1625                 return -EINVAL;
1626         }
1627
1628
1629         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1630                                   in_dev->dev, &spec_dst, &itag);
1631         if (err < 0) {
1632                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1633                                          saddr);
1634
1635                 err = -EINVAL;
1636                 goto cleanup;
1637         }
1638
1639         if (err)
1640                 flags |= RTCF_DIRECTSRC;
1641
1642         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1643             (IN_DEV_SHARED_MEDIA(out_dev) ||
1644              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1645                 flags |= RTCF_DOREDIRECT;
1646
1647         if (skb->protocol != htons(ETH_P_IP)) {
1648                 /* Not IP (i.e. ARP). Do not create a route if it is
1649                  * invalid for proxy ARP. DNAT routes are always valid.
1650                  */
1651                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1652                         err = -EINVAL;
1653                         goto cleanup;
1654                 }
1655         }
1656
1657
1658         rth = dst_alloc(&ipv4_dst_ops);
1659         if (!rth) {
1660                 err = -ENOBUFS;
1661                 goto cleanup;
1662         }
1663
1664         atomic_set(&rth->u.dst.__refcnt, 1);
1665         rth->u.dst.flags= DST_HOST;
1666         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1667                 rth->u.dst.flags |= DST_NOPOLICY;
1668         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1669                 rth->u.dst.flags |= DST_NOXFRM;
1670         rth->fl.fl4_dst = daddr;
1671         rth->rt_dst     = daddr;
1672         rth->fl.fl4_tos = tos;
1673         rth->fl.mark    = skb->mark;
1674         rth->fl.fl4_src = saddr;
1675         rth->rt_src     = saddr;
1676         rth->rt_gateway = daddr;
1677         rth->rt_iif     =
1678                 rth->fl.iif     = in_dev->dev->ifindex;
1679         rth->u.dst.dev  = (out_dev)->dev;
1680         dev_hold(rth->u.dst.dev);
1681         rth->idev       = in_dev_get(rth->u.dst.dev);
1682         rth->fl.oif     = 0;
1683         rth->rt_spec_dst= spec_dst;
1684
1685         rth->u.dst.input = ip_forward;
1686         rth->u.dst.output = ip_output;
1687
1688         rt_set_nexthop(rth, res, itag);
1689
1690         rth->rt_flags = flags;
1691
1692         *result = rth;
1693         err = 0;
1694  cleanup:
1695         /* release the working reference to the output device */
1696         in_dev_put(out_dev);
1697         return err;
1698 }
1699
1700 static inline int ip_mkroute_input(struct sk_buff *skb,
1701                                    struct fib_result* res,
1702                                    const struct flowi *fl,
1703                                    struct in_device *in_dev,
1704                                    __be32 daddr, __be32 saddr, u32 tos)
1705 {
1706         struct rtable* rth = NULL;
1707         int err;
1708         unsigned hash;
1709
1710 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1711         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1712                 fib_select_multipath(fl, res);
1713 #endif
1714
1715         /* create a routing cache entry */
1716         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1717         if (err)
1718                 return err;
1719
1720         /* put it into the cache */
1721         hash = rt_hash(daddr, saddr, fl->iif);
1722         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1723 }
1724
1725 /*
1726  *      NOTE. We drop all packets that have local source
1727  *      addresses, because every properly looped-back packet
1728  *      must have the correct destination already attached by the output routine.
1729  *
1730  *      This approach solves two big problems:
1731  *      1. Non-simplex devices are handled properly.
1732  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1733  */
1734
1735 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1736                                u8 tos, struct net_device *dev)
1737 {
1738         struct fib_result res;
1739         struct in_device *in_dev = in_dev_get(dev);
1740         struct flowi fl = { .nl_u = { .ip4_u =
1741                                       { .daddr = daddr,
1742                                         .saddr = saddr,
1743                                         .tos = tos,
1744                                         .scope = RT_SCOPE_UNIVERSE,
1745                                       } },
1746                             .mark = skb->mark,
1747                             .iif = dev->ifindex };
1748         unsigned        flags = 0;
1749         u32             itag = 0;
1750         struct rtable * rth;
1751         unsigned        hash;
1752         __be32          spec_dst;
1753         int             err = -EINVAL;
1754         int             free_res = 0;
1755
1756         /* IP on this device is disabled. */
1757
1758         if (!in_dev)
1759                 goto out;
1760
1761         /* Check for the most weird martians, which can go undetected
1762            by fib_lookup.
1763          */
1764
1765         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1766                 goto martian_source;
1767
1768         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1769                 goto brd_input;
1770
1771         /* Accept zero addresses only for the limited broadcast;
1772          * I do not even know whether to fix this or not. Waiting for complaints :-)
1773          */
1774         if (ZERONET(saddr))
1775                 goto martian_source;
1776
1777         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1778                 goto martian_destination;
1779
1780         /*
1781          *      Now we are ready to route the packet.
1782          */
1783         if ((err = fib_lookup(&fl, &res)) != 0) {
1784                 if (!IN_DEV_FORWARD(in_dev))
1785                         goto e_hostunreach;
1786                 goto no_route;
1787         }
1788         free_res = 1;
1789
1790         RT_CACHE_STAT_INC(in_slow_tot);
1791
1792         if (res.type == RTN_BROADCAST)
1793                 goto brd_input;
1794
1795         if (res.type == RTN_LOCAL) {
1796                 int result;
1797                 result = fib_validate_source(saddr, daddr, tos,
1798                                              init_net.loopback_dev->ifindex,
1799                                              dev, &spec_dst, &itag);
1800                 if (result < 0)
1801                         goto martian_source;
1802                 if (result)
1803                         flags |= RTCF_DIRECTSRC;
1804                 spec_dst = daddr;
1805                 goto local_input;
1806         }
1807
1808         if (!IN_DEV_FORWARD(in_dev))
1809                 goto e_hostunreach;
1810         if (res.type != RTN_UNICAST)
1811                 goto martian_destination;
1812
1813         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1814 done:
1815         in_dev_put(in_dev);
1816         if (free_res)
1817                 fib_res_put(&res);
1818 out:    return err;
1819
1820 brd_input:
1821         if (skb->protocol != htons(ETH_P_IP))
1822                 goto e_inval;
1823
1824         if (ZERONET(saddr))
1825                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1826         else {
1827                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1828                                           &itag);
1829                 if (err < 0)
1830                         goto martian_source;
1831                 if (err)
1832                         flags |= RTCF_DIRECTSRC;
1833         }
1834         flags |= RTCF_BROADCAST;
1835         res.type = RTN_BROADCAST;
1836         RT_CACHE_STAT_INC(in_brd);
1837
1838 local_input:
1839         rth = dst_alloc(&ipv4_dst_ops);
1840         if (!rth)
1841                 goto e_nobufs;
1842
1843         rth->u.dst.output= ip_rt_bug;
1844
1845         atomic_set(&rth->u.dst.__refcnt, 1);
1846         rth->u.dst.flags= DST_HOST;
1847         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1848                 rth->u.dst.flags |= DST_NOPOLICY;
1849         rth->fl.fl4_dst = daddr;
1850         rth->rt_dst     = daddr;
1851         rth->fl.fl4_tos = tos;
1852         rth->fl.mark    = skb->mark;
1853         rth->fl.fl4_src = saddr;
1854         rth->rt_src     = saddr;
1855 #ifdef CONFIG_NET_CLS_ROUTE
1856         rth->u.dst.tclassid = itag;
1857 #endif
1858         rth->rt_iif     =
1859         rth->fl.iif     = dev->ifindex;
1860         rth->u.dst.dev  = init_net.loopback_dev;
1861         dev_hold(rth->u.dst.dev);
1862         rth->idev       = in_dev_get(rth->u.dst.dev);
1863         rth->rt_gateway = daddr;
1864         rth->rt_spec_dst= spec_dst;
1865         rth->u.dst.input= ip_local_deliver;
1866         rth->rt_flags   = flags|RTCF_LOCAL;
1867         if (res.type == RTN_UNREACHABLE) {
1868                 rth->u.dst.input= ip_error;
1869                 rth->u.dst.error= -err;
1870                 rth->rt_flags   &= ~RTCF_LOCAL;
1871         }
1872         rth->rt_type    = res.type;
1873         hash = rt_hash(daddr, saddr, fl.iif);
1874         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1875         goto done;
1876
1877 no_route:
1878         RT_CACHE_STAT_INC(in_no_route);
1879         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1880         res.type = RTN_UNREACHABLE;
1881         goto local_input;
1882
1883         /*
1884          *      Do not cache martian addresses: they should be logged (RFC1812)
1885          */
1886 martian_destination:
1887         RT_CACHE_STAT_INC(in_martian_dst);
1888 #ifdef CONFIG_IP_ROUTE_VERBOSE
1889         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1890                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1891                         "%u.%u.%u.%u, dev %s\n",
1892                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1893 #endif
1894
1895 e_hostunreach:
1896         err = -EHOSTUNREACH;
1897         goto done;
1898
1899 e_inval:
1900         err = -EINVAL;
1901         goto done;
1902
1903 e_nobufs:
1904         err = -ENOBUFS;
1905         goto done;
1906
1907 martian_source:
1908         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1909         goto e_inval;
1910 }
1911
1912 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1913                    u8 tos, struct net_device *dev)
1914 {
1915         struct rtable * rth;
1916         unsigned        hash;
1917         int iif = dev->ifindex;
1918
1919         tos &= IPTOS_RT_MASK;
1920         hash = rt_hash(daddr, saddr, iif);
1921
1922         rcu_read_lock();
1923         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1924              rth = rcu_dereference(rth->u.dst.rt_next)) {
1925                 if (rth->fl.fl4_dst == daddr &&
1926                     rth->fl.fl4_src == saddr &&
1927                     rth->fl.iif == iif &&
1928                     rth->fl.oif == 0 &&
1929                     rth->fl.mark == skb->mark &&
1930                     rth->fl.fl4_tos == tos) {
1931                         dst_use(&rth->u.dst, jiffies);
1932                         RT_CACHE_STAT_INC(in_hit);
1933                         rcu_read_unlock();
1934                         skb->dst = (struct dst_entry*)rth;
1935                         return 0;
1936                 }
1937                 RT_CACHE_STAT_INC(in_hlist_search);
1938         }
1939         rcu_read_unlock();
1940
1941         /* Multicast recognition logic was moved from the route cache
1942            to here. The problem was that too many Ethernet cards have
1943            broken/missing hardware multicast filters :-( As a result, a
1944            host on a multicast network acquires a lot of useless route
1945            cache entries, sort of SDR messages from all over the world.
1946            Now we try to get rid of them. Really, provided the software
1947            IP multicast filter is organized reasonably (at least, hashed),
1948            it does not result in a slowdown compared with route cache
1949            reject entries. Note that multicast routers are not affected,
1950            because a route cache entry is created eventually.
1951          */
1952         if (MULTICAST(daddr)) {
1953                 struct in_device *in_dev;
1954
1955                 rcu_read_lock();
1956                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1957                         int our = ip_check_mc(in_dev, daddr, saddr,
1958                                 ip_hdr(skb)->protocol);
1959                         if (our
1960 #ifdef CONFIG_IP_MROUTE
1961                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1962 #endif
1963                             ) {
1964                                 rcu_read_unlock();
1965                                 return ip_route_input_mc(skb, daddr, saddr,
1966                                                          tos, dev, our);
1967                         }
1968                 }
1969                 rcu_read_unlock();
1970                 return -EINVAL;
1971         }
1972         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1973 }
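
/*
 * A simplified sketch of the receive-path caller (close to what
 * ip_rcv_finish() does in this kernel generation; the helper name is
 * illustrative and error handling is omitted):
 */
static inline int example_route_incoming(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* On success skb->dst is set, and dst_input() will later run either
	 * ip_local_deliver() or ip_forward() depending on the route. */
	return ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);
}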
1974
1975 static inline int __mkroute_output(struct rtable **result,
1976                                    struct fib_result* res,
1977                                    const struct flowi *fl,
1978                                    const struct flowi *oldflp,
1979                                    struct net_device *dev_out,
1980                                    unsigned flags)
1981 {
1982         struct rtable *rth;
1983         struct in_device *in_dev;
1984         u32 tos = RT_FL_TOS(oldflp);
1985         int err = 0;
1986
1987         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1988                 return -EINVAL;
1989
1990         if (fl->fl4_dst == htonl(0xFFFFFFFF))
1991                 res->type = RTN_BROADCAST;
1992         else if (MULTICAST(fl->fl4_dst))
1993                 res->type = RTN_MULTICAST;
1994         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
1995                 return -EINVAL;
1996
1997         if (dev_out->flags & IFF_LOOPBACK)
1998                 flags |= RTCF_LOCAL;
1999
2000         /* get a working reference to the inet device */
2001         in_dev = in_dev_get(dev_out);
2002         if (!in_dev)
2003                 return -EINVAL;
2004
2005         if (res->type == RTN_BROADCAST) {
2006                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2007                 if (res->fi) {
2008                         fib_info_put(res->fi);
2009                         res->fi = NULL;
2010                 }
2011         } else if (res->type == RTN_MULTICAST) {
2012                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2013                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2014                                  oldflp->proto))
2015                         flags &= ~RTCF_LOCAL;
2016                 /* If a multicast route does not exist, use
2017                    the default one, but do not use a gateway in
2018                    this case. Yes, it is a hack.
2019                  */
2020                 if (res->fi && res->prefixlen < 4) {
2021                         fib_info_put(res->fi);
2022                         res->fi = NULL;
2023                 }
2024         }
2025
2026
2027         rth = dst_alloc(&ipv4_dst_ops);
2028         if (!rth) {
2029                 err = -ENOBUFS;
2030                 goto cleanup;
2031         }
2032
2033         atomic_set(&rth->u.dst.__refcnt, 1);
2034         rth->u.dst.flags= DST_HOST;
2035         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2036                 rth->u.dst.flags |= DST_NOXFRM;
2037         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2038                 rth->u.dst.flags |= DST_NOPOLICY;
2039
2040         rth->fl.fl4_dst = oldflp->fl4_dst;
2041         rth->fl.fl4_tos = tos;
2042         rth->fl.fl4_src = oldflp->fl4_src;
2043         rth->fl.oif     = oldflp->oif;
2044         rth->fl.mark    = oldflp->mark;
2045         rth->rt_dst     = fl->fl4_dst;
2046         rth->rt_src     = fl->fl4_src;
2047         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2048         /* get references to the devices that are to be held by the
2049            routing cache entry */
2050         rth->u.dst.dev  = dev_out;
2051         dev_hold(dev_out);
2052         rth->idev       = in_dev_get(dev_out);
2053         rth->rt_gateway = fl->fl4_dst;
2054         rth->rt_spec_dst= fl->fl4_src;
2055
2056         rth->u.dst.output=ip_output;
2057
2058         RT_CACHE_STAT_INC(out_slow_tot);
2059
2060         if (flags & RTCF_LOCAL) {
2061                 rth->u.dst.input = ip_local_deliver;
2062                 rth->rt_spec_dst = fl->fl4_dst;
2063         }
2064         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2065                 rth->rt_spec_dst = fl->fl4_src;
2066                 if (flags & RTCF_LOCAL &&
2067                     !(dev_out->flags & IFF_LOOPBACK)) {
2068                         rth->u.dst.output = ip_mc_output;
2069                         RT_CACHE_STAT_INC(out_slow_mc);
2070                 }
2071 #ifdef CONFIG_IP_MROUTE
2072                 if (res->type == RTN_MULTICAST) {
2073                         if (IN_DEV_MFORWARD(in_dev) &&
2074                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2075                                 rth->u.dst.input = ip_mr_input;
2076                                 rth->u.dst.output = ip_mc_output;
2077                         }
2078                 }
2079 #endif
2080         }
2081
2082         rt_set_nexthop(rth, res, 0);
2083
2084         rth->rt_flags = flags;
2085
2086         *result = rth;
2087  cleanup:
2088         /* release the working reference to the inet device */
2089         in_dev_put(in_dev);
2090
2091         return err;
2092 }
2093
2094 static inline int ip_mkroute_output(struct rtable **rp,
2095                                     struct fib_result* res,
2096                                     const struct flowi *fl,
2097                                     const struct flowi *oldflp,
2098                                     struct net_device *dev_out,
2099                                     unsigned flags)
2100 {
2101         struct rtable *rth = NULL;
2102         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2103         unsigned hash;
2104         if (err == 0) {
2105                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2106                 err = rt_intern_hash(hash, rth, rp);
2107         }
2108
2109         return err;
2110 }
2111
2112 /*
2113  * Major route resolver routine.
2114  */
2115
2116 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2117 {
2118         u32 tos = RT_FL_TOS(oldflp);
2119         struct flowi fl = { .nl_u = { .ip4_u =
2120                                       { .daddr = oldflp->fl4_dst,
2121                                         .saddr = oldflp->fl4_src,
2122                                         .tos = tos & IPTOS_RT_MASK,
2123                                         .scope = ((tos & RTO_ONLINK) ?
2124                                                   RT_SCOPE_LINK :
2125                                                   RT_SCOPE_UNIVERSE),
2126                                       } },
2127                             .mark = oldflp->mark,
2128                             .iif = init_net.loopback_dev->ifindex,
2129                             .oif = oldflp->oif };
2130         struct fib_result res;
2131         unsigned flags = 0;
2132         struct net_device *dev_out = NULL;
2133         int free_res = 0;
2134         int err;
2135
2136
2137         res.fi          = NULL;
2138 #ifdef CONFIG_IP_MULTIPLE_TABLES
2139         res.r           = NULL;
2140 #endif
2141
2142         if (oldflp->fl4_src) {
2143                 err = -EINVAL;
2144                 if (MULTICAST(oldflp->fl4_src) ||
2145                     BADCLASS(oldflp->fl4_src) ||
2146                     ZERONET(oldflp->fl4_src))
2147                         goto out;
2148
2149                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2150                 dev_out = ip_dev_find(oldflp->fl4_src);
2151                 if (dev_out == NULL)
2152                         goto out;
2153
2154                 /* I removed the check for oif == dev_out->oif here.
2155                    It was wrong for two reasons:
2156                    1. ip_dev_find(saddr) can return the wrong iface if
2157                       saddr is assigned to multiple interfaces.
2158                    2. Moreover, we are allowed to send packets with the
2159                       saddr of another iface. --ANK
2160                  */
2161
2162                 if (oldflp->oif == 0
2163                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2164                         /* Special hack: the user can direct multicasts
2165                            and limited broadcasts via the necessary interface
2166                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2167                            This hack is not just for fun; it allows
2168                            vic, vat and friends to work.
2169                            They bind the socket to loopback, set ttl to zero
2170                            and expect that it will work.
2171                            From the viewpoint of the routing cache they are
2172                            broken, because we are not allowed to build a
2173                            multicast path with a loopback source addr (look:
2174                            the routing cache cannot know that ttl is zero, so
2175                            the packet will not leave this host and the route
2176                            is valid). Luckily, this hack is a good workaround.
2177                          */
2178
2179                         fl.oif = dev_out->ifindex;
2180                         goto make_route;
2181                 }
2182                 if (dev_out)
2183                         dev_put(dev_out);
2184                 dev_out = NULL;
2185         }
2186
2187
2188         if (oldflp->oif) {
2189                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2190                 err = -ENODEV;
2191                 if (dev_out == NULL)
2192                         goto out;
2193
2194                 /* RACE: Check return value of inet_select_addr instead. */
2195                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2196                         dev_put(dev_out);
2197                         goto out;       /* Wrong error code */
2198                 }
2199
2200                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2201                         if (!fl.fl4_src)
2202                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2203                                                               RT_SCOPE_LINK);
2204                         goto make_route;
2205                 }
2206                 if (!fl.fl4_src) {
2207                         if (MULTICAST(oldflp->fl4_dst))
2208                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2209                                                               fl.fl4_scope);
2210                         else if (!oldflp->fl4_dst)
2211                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2212                                                               RT_SCOPE_HOST);
2213                 }
2214         }
2215
2216         if (!fl.fl4_dst) {
2217                 fl.fl4_dst = fl.fl4_src;
2218                 if (!fl.fl4_dst)
2219                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2220                 if (dev_out)
2221                         dev_put(dev_out);
2222                 dev_out = init_net.loopback_dev;
2223                 dev_hold(dev_out);
2224                 fl.oif = init_net.loopback_dev->ifindex;
2225                 res.type = RTN_LOCAL;
2226                 flags |= RTCF_LOCAL;
2227                 goto make_route;
2228         }
2229
2230         if (fib_lookup(&fl, &res)) {
2231                 res.fi = NULL;
2232                 if (oldflp->oif) {
2233                         /* Apparently, routing tables are wrong. Assume
2234                            that the destination is on-link.
2235
2236                            WHY? DW.
2237                            Because we are allowed to send to an iface
2238                            even if it has NO routes and NO assigned
2239                            addresses. When oif is specified, routing
2240                            tables are looked up with only one purpose:
2241                            to check whether the destination is gatewayed
2242                            rather than direct. Moreover, if MSG_DONTROUTE
2243                            is set, we send the packet, ignoring both routing
2244                            tables and ifaddr state. --ANK
2245
2246
2247                            We could do this even when oif is unknown,
2248                            as IPv6 likely does, but we do not.
2249                          */
2250
2251                         if (fl.fl4_src == 0)
2252                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2253                                                               RT_SCOPE_LINK);
2254                         res.type = RTN_UNICAST;
2255                         goto make_route;
2256                 }
2257                 if (dev_out)
2258                         dev_put(dev_out);
2259                 err = -ENETUNREACH;
2260                 goto out;
2261         }
2262         free_res = 1;
2263
2264         if (res.type == RTN_LOCAL) {
2265                 if (!fl.fl4_src)
2266                         fl.fl4_src = fl.fl4_dst;
2267                 if (dev_out)
2268                         dev_put(dev_out);
2269                 dev_out = init_net.loopback_dev;
2270                 dev_hold(dev_out);
2271                 fl.oif = dev_out->ifindex;
2272                 if (res.fi)
2273                         fib_info_put(res.fi);
2274                 res.fi = NULL;
2275                 flags |= RTCF_LOCAL;
2276                 goto make_route;
2277         }
2278
2279 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2280         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2281                 fib_select_multipath(&fl, &res);
2282         else
2283 #endif
2284         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2285                 fib_select_default(&fl, &res);
2286
2287         if (!fl.fl4_src)
2288                 fl.fl4_src = FIB_RES_PREFSRC(res);
2289
2290         if (dev_out)
2291                 dev_put(dev_out);
2292         dev_out = FIB_RES_DEV(res);
2293         dev_hold(dev_out);
2294         fl.oif = dev_out->ifindex;
2295
2296
2297 make_route:
2298         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2299
2300
2301         if (free_res)
2302                 fib_res_put(&res);
2303         if (dev_out)
2304                 dev_put(dev_out);
2305 out:    return err;
2306 }
2307
2308 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2309 {
2310         unsigned hash;
2311         struct rtable *rth;
2312
2313         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2314
2315         rcu_read_lock_bh();
2316         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2317                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2318                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2319                     rth->fl.fl4_src == flp->fl4_src &&
2320                     rth->fl.iif == 0 &&
2321                     rth->fl.oif == flp->oif &&
2322                     rth->fl.mark == flp->mark &&
2323                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2324                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2325                         dst_use(&rth->u.dst, jiffies);
2326                         RT_CACHE_STAT_INC(out_hit);
2327                         rcu_read_unlock_bh();
2328                         *rp = rth;
2329                         return 0;
2330                 }
2331                 RT_CACHE_STAT_INC(out_hlist_search);
2332         }
2333         rcu_read_unlock_bh();
2334
2335         return ip_route_output_slow(rp, flp);
2336 }
2337
2338 EXPORT_SYMBOL_GPL(__ip_route_output_key);
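
/*
 * Note on the cache-hit test in __ip_route_output_key() above: the XOR
 * expression accepts a cached entry only when the cached and requested
 * TOS values agree on the routing-relevant bits (IPTOS_RT_MASK) and on
 * the RTO_ONLINK flag; all other bits may differ freely.
 */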
2339
2340 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2341 {
2342 }
2343
2344 static struct dst_ops ipv4_dst_blackhole_ops = {
2345         .family                 =       AF_INET,
2346         .protocol               =       __constant_htons(ETH_P_IP),
2347         .destroy                =       ipv4_dst_destroy,
2348         .check                  =       ipv4_dst_check,
2349         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2350         .entry_size             =       sizeof(struct rtable),
2351 };
2352
2353
2354 static int ipv4_blackhole_output(struct sk_buff *skb)
2355 {
2356         kfree_skb(skb);
2357         return 0;
2358 }
2359
2360 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2361 {
2362         struct rtable *ort = *rp;
2363         struct rtable *rt = (struct rtable *)
2364                 dst_alloc(&ipv4_dst_blackhole_ops);
2365
2366         if (rt) {
2367                 struct dst_entry *new = &rt->u.dst;
2368
2369                 atomic_set(&new->__refcnt, 1);
2370                 new->__use = 1;
2371                 new->input = ipv4_blackhole_output;
2372                 new->output = ipv4_blackhole_output;
2373                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2374
2375                 new->dev = ort->u.dst.dev;
2376                 if (new->dev)
2377                         dev_hold(new->dev);
2378
2379                 rt->fl = ort->fl;
2380
2381                 rt->idev = ort->idev;
2382                 if (rt->idev)
2383                         in_dev_hold(rt->idev);
2384                 rt->rt_flags = ort->rt_flags;
2385                 rt->rt_type = ort->rt_type;
2386                 rt->rt_dst = ort->rt_dst;
2387                 rt->rt_src = ort->rt_src;
2388                 rt->rt_iif = ort->rt_iif;
2389                 rt->rt_gateway = ort->rt_gateway;
2390                 rt->rt_spec_dst = ort->rt_spec_dst;
2391                 rt->peer = ort->peer;
2392                 if (rt->peer)
2393                         atomic_inc(&rt->peer->refcnt);
2394
2395                 dst_free(new);
2396         }
2397
2398         dst_release(&(*rp)->u.dst);
2399         *rp = rt;
2400         return (rt ? 0 : -ENOMEM);
2401 }
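
/*
 * The blackhole dst built above keeps the flow key, metrics and device of
 * the original route but discards every packet fed to it.  It is used
 * below when __xfrm_lookup() returns -EREMOTE, i.e. while an IPsec state
 * is still being resolved, so senders see transient packet drops instead
 * of a hard error during the negotiation window.
 */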
2402
2403 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2404 {
2405         int err;
2406
2407         if ((err = __ip_route_output_key(rp, flp)) != 0)
2408                 return err;
2409
2410         if (flp->proto) {
2411                 if (!flp->fl4_src)
2412                         flp->fl4_src = (*rp)->rt_src;
2413                 if (!flp->fl4_dst)
2414                         flp->fl4_dst = (*rp)->rt_dst;
2415                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2416                 if (err == -EREMOTE)
2417                         err = ipv4_dst_blackhole(rp, flp, sk);
2418
2419                 return err;
2420         }
2421
2422         return 0;
2423 }
2424
2425 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2426
2427 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2428 {
2429         return ip_route_output_flow(rp, flp, NULL, 0);
2430 }
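
/*
 * A minimal usage sketch for the output path; the helper name and values
 * are illustrative.  Resolve a route for a destination, use it, then drop
 * the reference with ip_rt_put().
 */
static int example_resolve_output(__be32 dip)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
	struct rtable *rt;
	int err = ip_route_output_key(&rt, &fl);

	if (err)
		return err;
	/* ... transmit via rt->u.dst ... */
	ip_rt_put(rt);
	return 0;
}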
2431
2432 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2433                         int nowait, unsigned int flags)
2434 {
2435         struct rtable *rt = (struct rtable*)skb->dst;
2436         struct rtmsg *r;
2437         struct nlmsghdr *nlh;
2438         long expires;
2439         u32 id = 0, ts = 0, tsage = 0, error;
2440
2441         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2442         if (nlh == NULL)
2443                 return -EMSGSIZE;
2444
2445         r = nlmsg_data(nlh);
2446         r->rtm_family    = AF_INET;
2447         r->rtm_dst_len  = 32;
2448         r->rtm_src_len  = 0;
2449         r->rtm_tos      = rt->fl.fl4_tos;
2450         r->rtm_table    = RT_TABLE_MAIN;
2451         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2452         r->rtm_type     = rt->rt_type;
2453         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2454         r->rtm_protocol = RTPROT_UNSPEC;
2455         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2456         if (rt->rt_flags & RTCF_NOTIFY)
2457                 r->rtm_flags |= RTM_F_NOTIFY;
2458
2459         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2460
2461         if (rt->fl.fl4_src) {
2462                 r->rtm_src_len = 32;
2463                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2464         }
2465         if (rt->u.dst.dev)
2466                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2467 #ifdef CONFIG_NET_CLS_ROUTE
2468         if (rt->u.dst.tclassid)
2469                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2470 #endif
2471         if (rt->fl.iif)
2472                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2473         else if (rt->rt_src != rt->fl.fl4_src)
2474                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2475
2476         if (rt->rt_dst != rt->rt_gateway)
2477                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2478
2479         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2480                 goto nla_put_failure;
2481
2482         error = rt->u.dst.error;
2483         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2484         if (rt->peer) {
2485                 id = rt->peer->ip_id_count;
2486                 if (rt->peer->tcp_ts_stamp) {
2487                         ts = rt->peer->tcp_ts;
2488                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2489                 }
2490         }
2491
2492         if (rt->fl.iif) {
2493 #ifdef CONFIG_IP_MROUTE
2494                 __be32 dst = rt->rt_dst;
2495
2496                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2497                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2498                         int err = ipmr_get_route(skb, r, nowait);
2499                         if (err <= 0) {
2500                                 if (!nowait) {
2501                                         if (err == 0)
2502                                                 return 0;
2503                                         goto nla_put_failure;
2504                                 } else {
2505                                         if (err == -EMSGSIZE)
2506                                                 goto nla_put_failure;
2507                                         error = err;
2508                                 }
2509                         }
2510                 } else
2511 #endif
2512                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2513         }
2514
2515         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2516                                expires, error) < 0)
2517                 goto nla_put_failure;
2518
2519         return nlmsg_end(skb, nlh);
2520
2521 nla_put_failure:
2522         nlmsg_cancel(skb, nlh);
2523         return -EMSGSIZE;
2524 }
2525
2526 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2527 {
2528         struct rtmsg *rtm;
2529         struct nlattr *tb[RTA_MAX+1];
2530         struct rtable *rt = NULL;
2531         __be32 dst = 0;
2532         __be32 src = 0;
2533         u32 iif;
2534         int err;
2535         struct sk_buff *skb;
2536
2537         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2538         if (err < 0)
2539                 goto errout;
2540
2541         rtm = nlmsg_data(nlh);
2542
2543         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2544         if (skb == NULL) {
2545                 err = -ENOBUFS;
2546                 goto errout;
2547         }
2548
2549         /* Reserve room for dummy headers; this skb can pass
2550            through a good chunk of the routing engine.
2551          */
2552         skb_reset_mac_header(skb);
2553         skb_reset_network_header(skb);
2554
2555         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2556         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2557         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2558
2559         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2560         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2561         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2562
2563         if (iif) {
2564                 struct net_device *dev;
2565
2566                 dev = __dev_get_by_index(&init_net, iif);
2567                 if (dev == NULL) {
2568                         err = -ENODEV;
2569                         goto errout_free;
2570                 }
2571
2572                 skb->protocol   = htons(ETH_P_IP);
2573                 skb->dev        = dev;
2574                 local_bh_disable();
2575                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2576                 local_bh_enable();
2577
2578                 rt = (struct rtable*) skb->dst;
2579                 if (err == 0 && rt->u.dst.error)
2580                         err = -rt->u.dst.error;
2581         } else {
2582                 struct flowi fl = {
2583                         .nl_u = {
2584                                 .ip4_u = {
2585                                         .daddr = dst,
2586                                         .saddr = src,
2587                                         .tos = rtm->rtm_tos,
2588                                 },
2589                         },
2590                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2591                 };
2592                 err = ip_route_output_key(&rt, &fl);
2593         }
2594
2595         if (err)
2596                 goto errout_free;
2597
2598         skb->dst = &rt->u.dst;
2599         if (rtm->rtm_flags & RTM_F_NOTIFY)
2600                 rt->rt_flags |= RTCF_NOTIFY;
2601
2602         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2603                                 RTM_NEWROUTE, 0, 0);
2604         if (err <= 0)
2605                 goto errout_free;
2606
2607         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2608 errout:
2609         return err;
2610
2611 errout_free:
2612         kfree_skb(skb);
2613         goto errout;
2614 }
2615
2616 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2617 {
2618         struct rtable *rt;
2619         int h, s_h;
2620         int idx, s_idx;
2621
2622         s_h = cb->args[0];
2623         s_idx = idx = cb->args[1];
2624         for (h = 0; h <= rt_hash_mask; h++) {
2625                 if (h < s_h) continue;
2626                 if (h > s_h)
2627                         s_idx = 0;
2628                 rcu_read_lock_bh();
2629                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2630                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2631                         if (idx < s_idx)
2632                                 continue;
2633                         skb->dst = dst_clone(&rt->u.dst);
2634                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2635                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2636                                          1, NLM_F_MULTI) <= 0) {
2637                                 dst_release(xchg(&skb->dst, NULL));
2638                                 rcu_read_unlock_bh();
2639                                 goto done;
2640                         }
2641                         dst_release(xchg(&skb->dst, NULL));
2642                 }
2643                 rcu_read_unlock_bh();
2644         }
2645
2646 done:
2647         cb->args[0] = h;
2648         cb->args[1] = idx;
2649         return skb->len;
2650 }
2651
2652 void ip_rt_multicast_event(struct in_device *in_dev)
2653 {
2654         rt_cache_flush(0);
2655 }
2656
2657 #ifdef CONFIG_SYSCTL
2658 static int flush_delay;
2659
2660 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2661                                         struct file *filp, void __user *buffer,
2662                                         size_t *lenp, loff_t *ppos)
2663 {
2664         if (write) {
2665                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2666                 rt_cache_flush(flush_delay);
2667                 return 0;
2668         }
2669
2670         return -EINVAL;
2671 }
2672
2673 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2674                                                 int __user *name,
2675                                                 int nlen,
2676                                                 void __user *oldval,
2677                                                 size_t __user *oldlenp,
2678                                                 void __user *newval,
2679                                                 size_t newlen)
2680 {
2681         int delay;
2682         if (newlen != sizeof(int))
2683                 return -EINVAL;
2684         if (get_user(delay, (int __user *)newval))
2685                 return -EFAULT;
2686         rt_cache_flush(delay);
2687         return 0;
2688 }
2689
2690 ctl_table ipv4_route_table[] = {
2691         {
2692                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2693                 .procname       = "flush",
2694                 .data           = &flush_delay,
2695                 .maxlen         = sizeof(int),
2696                 .mode           = 0200,
2697                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2698                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2699         },
2700         {
2701                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2702                 .procname       = "min_delay",
2703                 .data           = &ip_rt_min_delay,
2704                 .maxlen         = sizeof(int),
2705                 .mode           = 0644,
2706                 .proc_handler   = &proc_dointvec_jiffies,
2707                 .strategy       = &sysctl_jiffies,
2708         },
2709         {
2710                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2711                 .procname       = "max_delay",
2712                 .data           = &ip_rt_max_delay,
2713                 .maxlen         = sizeof(int),
2714                 .mode           = 0644,
2715                 .proc_handler   = &proc_dointvec_jiffies,
2716                 .strategy       = &sysctl_jiffies,
2717         },
2718         {
2719                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2720                 .procname       = "gc_thresh",
2721                 .data           = &ipv4_dst_ops.gc_thresh,
2722                 .maxlen         = sizeof(int),
2723                 .mode           = 0644,
2724                 .proc_handler   = &proc_dointvec,
2725         },
2726         {
2727                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2728                 .procname       = "max_size",
2729                 .data           = &ip_rt_max_size,
2730                 .maxlen         = sizeof(int),
2731                 .mode           = 0644,
2732                 .proc_handler   = &proc_dointvec,
2733         },
2734         {
2735                 /*  Deprecated. Use gc_min_interval_ms */
2736
2737                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2738                 .procname       = "gc_min_interval",
2739                 .data           = &ip_rt_gc_min_interval,
2740                 .maxlen         = sizeof(int),
2741                 .mode           = 0644,
2742                 .proc_handler   = &proc_dointvec_jiffies,
2743                 .strategy       = &sysctl_jiffies,
2744         },
2745         {
2746                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2747                 .procname       = "gc_min_interval_ms",
2748                 .data           = &ip_rt_gc_min_interval,
2749                 .maxlen         = sizeof(int),
2750                 .mode           = 0644,
2751                 .proc_handler   = &proc_dointvec_ms_jiffies,
2752                 .strategy       = &sysctl_ms_jiffies,
2753         },
2754         {
2755                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2756                 .procname       = "gc_timeout",
2757                 .data           = &ip_rt_gc_timeout,
2758                 .maxlen         = sizeof(int),
2759                 .mode           = 0644,
2760                 .proc_handler   = &proc_dointvec_jiffies,
2761                 .strategy       = &sysctl_jiffies,
2762         },
2763         {
2764                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2765                 .procname       = "gc_interval",
2766                 .data           = &ip_rt_gc_interval,
2767                 .maxlen         = sizeof(int),
2768                 .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
#endif
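
/*
 * The table above is exported under /proc/sys/net/ipv4/route/.  Entries
 * using proc_dointvec_jiffies (e.g. mtu_expires, secret_interval) are
 * read and written in seconds and converted to jiffies internally.
 */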

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
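/*
 * Layout: 256 counter slots per possible CPU, one slot per route realm
 * value (0-255) configured via CONFIG_NET_CLS_ROUTE; readers are
 * expected to sum the per-CPU blocks, as ip_rt_acct_read() does below.
 */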

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        /* The table is an array of u32 counters: only word-aligned,
           word-sized reads make sense. */
        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        /* Convert the byte offset into a u32 index. */
        offset /= sizeof(u32);

        if (length > 0) {
                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
                u32 *dst = (u32 *) buffer;

                /* Copy first cpu. */
                *start = buffer;
                memcpy(dst, src, length);

                /* Add the other cpus in, one int at a time */
                for_each_possible_cpu(i) {
                        unsigned int j;

                        if (i == 0)     /* already copied above */
                                continue;

                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
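
/*
 * Exported below as /proc/net/rt_acct: a single binary table of 256
 * struct ip_rt_acct entries with the per-CPU counters already summed.
 */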
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);
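/*
 * Booting with "rhash_entries=<n>" (e.g. rhash_entries=2048) overrides
 * the memory-size based default that alloc_large_system_hash() would
 * otherwise pick in ip_rt_init() below.
 */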

int __init ip_rt_init(void)
{
        int rc = 0;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

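        /*
         * Route-realm accounting: find the smallest page order that can
         * hold 256 counter slots for every possible CPU, then allocate
         * and zero the whole table in one contiguous chunk.
         */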
#ifdef CONFIG_NET_CLS_ROUTE
        {
        int order;
        for (order = 0;
             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
                /* NOTHING */;
        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
        memset(ip_rt_acct, 0, PAGE_SIZE << order);
        }
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

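        /*
         * rt_hash_mask and rt_hash_log come back from
         * alloc_large_system_hash(), so the bucket count is a power of
         * two: garbage collection aims to keep the number of cached
         * routes near the bucket count, with a hard cap of 16 entries
         * per bucket on average.
         */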
        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

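        /*
         * rt_run_flush() empties the cache (picking a fresh rt_hash_rnd
         * along the way) and rt_secret_rebuild() schedules such a flush
         * every secret_interval, so collision chains against any one
         * hash secret are short-lived.
         */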
        init_timer(&rt_flush_timer);
        rt_flush_timer.function = rt_run_flush;
        init_timer(&rt_secret_timer);
        rt_secret_timer.function = rt_secret_rebuild;

        /* All the timers, started at system startup, tend
           to synchronize.  Perturb them a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
        {
        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
        if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
                                             init_net.proc_net_stat))) {
                return -ENOMEM;
        }
        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
        }
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
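        /* Answer RTM_GETROUTE requests from user space; this is the
           handler behind "ip route get". */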
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

        return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);