]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blob - net/ipv4/ip_gre.c
fd192d676955fde161200bc37c4bc07650292a00
[linux-2.6-omap-h63xx.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
31
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/xfrm.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
44
45 #ifdef CONFIG_IPV6
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #endif
50
51 /*
52    Problems & solutions
53    --------------------
54
55    1. The most important issue is detecting local dead loops.
56    They would cause complete host lockup in transmit, which
57    would be "resolved" by stack overflow or, if queueing is enabled,
58    with infinite looping in net_bh.
59
60    We cannot track such dead loops during route installation,
61    it is infeasible task. The most general solutions would be
62    to keep skb->encapsulation counter (sort of local ttl),
63    and silently drop packet when it expires. It is the best
64    solution, but it supposes maintaining a new variable in ALL
65    skb, even if no tunneling is used.
66
67    Current solution: t->recursion lock breaks dead loops. It looks
68    like dev->tbusy flag, but I preferred new variable, because
69    the semantics is different. One day, when hard_start_xmit
70    will be multithreaded we will have to use skb->encapsulation.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    fastly degrades to value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
/* Forward declarations: device init/setup callbacks used when creating tunnels. */
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122
123 /* Fallback tunnel: no source, no destination, no key, no options */
124
125 static int ipgre_fb_tunnel_init(struct net_device *dev);
126
/* Number of buckets in each of the tunnel hash tables. */
127 #define HASH_SIZE  16
128
/* Identifier handed to net_generic() to fetch this module's per-namespace state. */
129 static int ipgre_net_id;
/* Per-network-namespace GRE state. */
130 struct ipgre_net {
        /* Four hash tables of tunnel chains; the first index is the match priority. */
131         struct ip_tunnel *tunnels[4][HASH_SIZE];
132
        /* Device whose tunnel is returned by lookup when nothing else matches. */
133         struct net_device *fb_tunnel_dev;
134 };
135
136 /* Tunnel hash table */
137
138 /*
139    4 hash tables:
140
141    3: (remote,local)
142    2: (remote,*)
143    1: (*,local)
144    0: (*,*)
145
146    We require exact key match i.e. if a key is present in packet
147    it will match only tunnel with the same key; if it is not present,
148    it will match only keyless tunnel.
149
150    All keyless packets that do not match a configured keyless tunnel
151    will match the fallback tunnel.
152  */
153
/*
 * Fold a 32-bit value (address or key) into a bucket index in
 * [0, HASH_SIZE).  HASH_SIZE must be a power of two for the mask to work.
 * The argument is parenthesized so arbitrary expressions can be passed, but
 * it is evaluated twice and therefore must not have side effects.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))

/* Shorthands for the four per-namespace tables, most specific first. */
#define tunnels_r_l     tunnels[3]	/* (remote, local) */
#define tunnels_r       tunnels[2]	/* (remote, *)     */
#define tunnels_l       tunnels[1]	/* (*, local)      */
#define tunnels_wc      tunnels[0]	/* (*, *)          */

/*
 * Protects the tunnel hash chains: taken for writing when a tunnel is
 * linked/unlinked, and for reading on the receive and ICMP error paths.
 */
static DEFINE_RWLOCK(ipgre_lock);
162
163 /* Given src, dst and key, find appropriate for input tunnel. */
164
165 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166                 __be32 remote, __be32 local, __be32 key)
167 {
168         unsigned h0 = HASH(remote);
169         unsigned h1 = HASH(key);
170         struct ip_tunnel *t;
171         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
172
173         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
176                                 return t;
177                 }
178         }
179         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180                 if (remote == t->parms.iph.daddr) {
181                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
182                                 return t;
183                 }
184         }
185         for (t = ign->tunnels_l[h1]; t; t = t->next) {
186                 if (local == t->parms.iph.saddr ||
187                      (local == t->parms.iph.daddr &&
188                       ipv4_is_multicast(local))) {
189                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
190                                 return t;
191                 }
192         }
193         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194                 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
195                         return t;
196         }
197
198         if (ign->fb_tunnel_dev->flags&IFF_UP)
199                 return netdev_priv(ign->fb_tunnel_dev);
200         return NULL;
201 }
202
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204                 struct ip_tunnel_parm *parms)
205 {
206         __be32 remote = parms->iph.daddr;
207         __be32 local = parms->iph.saddr;
208         __be32 key = parms->i_key;
209         unsigned h = HASH(key);
210         int prio = 0;
211
212         if (local)
213                 prio |= 1;
214         if (remote && !ipv4_is_multicast(remote)) {
215                 prio |= 2;
216                 h ^= HASH(remote);
217         }
218
219         return &ign->tunnels[prio][h];
220 }
221
222 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
223                 struct ip_tunnel *t)
224 {
225         return __ipgre_bucket(ign, &t->parms);
226 }
227
228 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
229 {
230         struct ip_tunnel **tp = ipgre_bucket(ign, t);
231
232         t->next = *tp;
233         write_lock_bh(&ipgre_lock);
234         *tp = t;
235         write_unlock_bh(&ipgre_lock);
236 }
237
238 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
239 {
240         struct ip_tunnel **tp;
241
242         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
243                 if (t == *tp) {
244                         write_lock_bh(&ipgre_lock);
245                         *tp = t->next;
246                         write_unlock_bh(&ipgre_lock);
247                         break;
248                 }
249         }
250 }
251
252 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
253                 struct ip_tunnel_parm *parms, int create)
254 {
255         __be32 remote = parms->iph.daddr;
256         __be32 local = parms->iph.saddr;
257         __be32 key = parms->i_key;
258         struct ip_tunnel *t, **tp, *nt;
259         struct net_device *dev;
260         char name[IFNAMSIZ];
261         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
262
263         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
264                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
265                         if (key == t->parms.i_key)
266                                 return t;
267                 }
268         }
269         if (!create)
270                 return NULL;
271
272         if (parms->name[0])
273                 strlcpy(name, parms->name, IFNAMSIZ);
274         else
275                 sprintf(name, "gre%%d");
276
277         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
278         if (!dev)
279           return NULL;
280
281         dev_net_set(dev, net);
282
283         if (strchr(name, '%')) {
284                 if (dev_alloc_name(dev, name) < 0)
285                         goto failed_free;
286         }
287
288         dev->init = ipgre_tunnel_init;
289         nt = netdev_priv(dev);
290         nt->parms = *parms;
291
292         if (register_netdevice(dev) < 0)
293                 goto failed_free;
294
295         dev_hold(dev);
296         ipgre_tunnel_link(ign, nt);
297         return nt;
298
299 failed_free:
300         free_netdev(dev);
301         return NULL;
302 }
303
304 static void ipgre_tunnel_uninit(struct net_device *dev)
305 {
306         struct net *net = dev_net(dev);
307         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
308
309         ipgre_tunnel_unlink(ign, netdev_priv(dev));
310         dev_put(dev);
311 }
312
313
314 static void ipgre_err(struct sk_buff *skb, u32 info)
315 {
316
317 /* All the routers (except for Linux) return only
318    8 bytes of packet payload. It means, that precise relaying of
319    ICMP in the real Internet is absolutely infeasible.
320
321    Moreover, Cisco "wise men" put GRE key to the third word
322    in GRE header. It makes impossible maintaining even soft state for keyed
323    GRE tunnels with enabled checksum. Tell them "thank you".
324
325    Well, I wonder, rfc1812 was written by Cisco employee,
326    what the hell these idiots break standrads established
327    by themself???
328  */
329
330         struct iphdr *iph = (struct iphdr*)skb->data;
331         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
332         int grehlen = (iph->ihl<<2) + 4;
333         const int type = icmp_hdr(skb)->type;
334         const int code = icmp_hdr(skb)->code;
335         struct ip_tunnel *t;
336         __be16 flags;
337
338         flags = p[0];
339         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
340                 if (flags&(GRE_VERSION|GRE_ROUTING))
341                         return;
342                 if (flags&GRE_KEY) {
343                         grehlen += 4;
344                         if (flags&GRE_CSUM)
345                                 grehlen += 4;
346                 }
347         }
348
349         /* If only 8 bytes returned, keyed message will be dropped here */
350         if (skb_headlen(skb) < grehlen)
351                 return;
352
353         switch (type) {
354         default:
355         case ICMP_PARAMETERPROB:
356                 return;
357
358         case ICMP_DEST_UNREACH:
359                 switch (code) {
360                 case ICMP_SR_FAILED:
361                 case ICMP_PORT_UNREACH:
362                         /* Impossible event. */
363                         return;
364                 case ICMP_FRAG_NEEDED:
365                         /* Soft state for pmtu is maintained by IP core. */
366                         return;
367                 default:
368                         /* All others are translated to HOST_UNREACH.
369                            rfc2003 contains "deep thoughts" about NET_UNREACH,
370                            I believe they are just ether pollution. --ANK
371                          */
372                         break;
373                 }
374                 break;
375         case ICMP_TIME_EXCEEDED:
376                 if (code != ICMP_EXC_TTL)
377                         return;
378                 break;
379         }
380
381         read_lock(&ipgre_lock);
382         t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
383                         (flags&GRE_KEY) ?
384                         *(((__be32*)p) + (grehlen>>2) - 1) : 0);
385         if (t == NULL || t->parms.iph.daddr == 0 ||
386             ipv4_is_multicast(t->parms.iph.daddr))
387                 goto out;
388
389         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
390                 goto out;
391
392         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
393                 t->err_count++;
394         else
395                 t->err_count = 1;
396         t->err_time = jiffies;
397 out:
398         read_unlock(&ipgre_lock);
399         return;
400 }
401
402 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
403 {
404         if (INET_ECN_is_ce(iph->tos)) {
405                 if (skb->protocol == htons(ETH_P_IP)) {
406                         IP_ECN_set_ce(ip_hdr(skb));
407                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
408                         IP6_ECN_set_ce(ipv6_hdr(skb));
409                 }
410         }
411 }
412
413 static inline u8
414 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
415 {
416         u8 inner = 0;
417         if (skb->protocol == htons(ETH_P_IP))
418                 inner = old_iph->tos;
419         else if (skb->protocol == htons(ETH_P_IPV6))
420                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
421         return INET_ECN_encapsulate(tos, inner);
422 }
423
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE header at skb->data (flags, optional checksum, key and
 * sequence number), looks up the tunnel for the outer addresses and key,
 * strips the GRE header and hands the inner packet to netif_rx() on the
 * tunnel device.  On every failure path the skb is freed.  Always returns 0.
 */
424 static int ipgre_rcv(struct sk_buff *skb)
425 {
426         struct iphdr *iph;
427         u8     *h;
428         __be16    flags;
429         __sum16   csum = 0;
430         __be32 key = 0;
431         u32    seqno = 0;
432         struct ip_tunnel *tunnel;
        /* Length of the GRE header parsed so far; starts at the 4-byte base header. */
433         int    offset = 4;
434
        /* 16 bytes = base header plus the three optional 4-byte fields read below. */
435         if (!pskb_may_pull(skb, 16))
436                 goto drop_nolock;
437
438         iph = ip_hdr(skb);
439         h = skb->data;
440         flags = *(__be16*)h;
441
442         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
443                 /* - Version must be 0.
444                    - We do not support routing headers.
445                  */
446                 if (flags&(GRE_VERSION|GRE_ROUTING))
447                         goto drop_nolock;
448
449                 if (flags&GRE_CSUM) {
                        /* A non-zero csum after this switch is treated as a checksum failure below. */
450                         switch (skb->ip_summed) {
451                         case CHECKSUM_COMPLETE:
452                                 csum = csum_fold(skb->csum);
453                                 if (!csum)
454                                         break;
455                                 /* fall through */
456                         case CHECKSUM_NONE:
457                                 skb->csum = 0;
458                                 csum = __skb_checksum_complete(skb);
459                                 skb->ip_summed = CHECKSUM_COMPLETE;
460                         }
461                         offset += 4;
462                 }
463                 if (flags&GRE_KEY) {
464                         key = *(__be32*)(h + offset);
465                         offset += 4;
466                 }
467                 if (flags&GRE_SEQ) {
468                         seqno = ntohl(*(__be32*)(h + offset));
469                         offset += 4;
470                 }
471         }
472
473         read_lock(&ipgre_lock);
474         if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
475                                         iph->saddr, iph->daddr, key)) != NULL) {
476                 struct net_device_stats *stats = &tunnel->dev->stats;
477
478                 secpath_reset(skb);
479
                /* The inner protocol type is the second 16-bit word of the GRE header. */
480                 skb->protocol = *(__be16*)(h + 2);
481                 /* WCCP version 1 and 2 protocol decoding.
482                  * - Change protocol to IP
483                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
484                  */
485                 if (flags == 0 &&
486                     skb->protocol == htons(ETH_P_WCCP)) {
487                         skb->protocol = htons(ETH_P_IP);
                        /* Payload not starting with IP version 4: skip 4 more bytes. */
488                         if ((*(h + offset) & 0xF0) != 0x40)
489                                 offset += 4;
490                 }
491
                /* Strip the GRE header; the inner packet becomes the network header. */
492                 skb->mac_header = skb->network_header;
493                 __pskb_pull(skb, offset);
494                 skb_reset_network_header(skb);
495                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
496                 skb->pkt_type = PACKET_HOST;
497 #ifdef CONFIG_NET_IPGRE_BROADCAST
498                 if (ipv4_is_multicast(iph->daddr)) {
499                         /* Looped back packet, drop it! */
500                         if (skb->rtable->fl.iif == 0)
501                                 goto drop;
502                         stats->multicast++;
503                         skb->pkt_type = PACKET_BROADCAST;
504                 }
505 #endif
506
                /* Drop on a failed checksum, or when the tunnel requires one and none was sent. */
507                 if (((flags&GRE_CSUM) && csum) ||
508                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
509                         stats->rx_crc_errors++;
510                         stats->rx_errors++;
511                         goto drop;
512                 }
513                 if (tunnel->parms.i_flags&GRE_SEQ) {
                        /* Drop packets with no sequence number or one behind the expected value. */
514                         if (!(flags&GRE_SEQ) ||
515                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
516                                 stats->rx_fifo_errors++;
517                                 stats->rx_errors++;
518                                 goto drop;
519                         }
520                         tunnel->i_seqno = seqno + 1;
521                 }
522                 stats->rx_packets++;
523                 stats->rx_bytes += skb->len;
                /* Re-home the packet on the tunnel device and drop state tied to the outer packet. */
524                 skb->dev = tunnel->dev;
525                 dst_release(skb->dst);
526                 skb->dst = NULL;
527                 nf_reset(skb);
528                 ipgre_ecn_decapsulate(iph, skb);
529                 netif_rx(skb);
530                 read_unlock(&ipgre_lock);
531                 return(0);
532         }
        /* Lookup returned no tunnel: report port unreachable, then free the skb. */
533         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
534
535 drop:
536         read_unlock(&ipgre_lock);
537 drop_nolock:
538         kfree_skb(skb);
539         return(0);
540 }
541
542 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
543 {
544         struct ip_tunnel *tunnel = netdev_priv(dev);
545         struct net_device_stats *stats = &tunnel->dev->stats;
546         struct iphdr  *old_iph = ip_hdr(skb);
547         struct iphdr  *tiph;
548         u8     tos;
549         __be16 df;
550         struct rtable *rt;                      /* Route to the other host */
551         struct net_device *tdev;                        /* Device to other host */
552         struct iphdr  *iph;                     /* Our new IP header */
553         unsigned int max_headroom;              /* The extra header space needed */
554         int    gre_hlen;
555         __be32 dst;
556         int    mtu;
557
558         if (tunnel->recursion++) {
559                 stats->collisions++;
560                 goto tx_error;
561         }
562
563         if (dev->header_ops) {
564                 gre_hlen = 0;
565                 tiph = (struct iphdr*)skb->data;
566         } else {
567                 gre_hlen = tunnel->hlen;
568                 tiph = &tunnel->parms.iph;
569         }
570
571         if ((dst = tiph->daddr) == 0) {
572                 /* NBMA tunnel */
573
574                 if (skb->dst == NULL) {
575                         stats->tx_fifo_errors++;
576                         goto tx_error;
577                 }
578
579                 if (skb->protocol == htons(ETH_P_IP)) {
580                         rt = skb->rtable;
581                         if ((dst = rt->rt_gateway) == 0)
582                                 goto tx_error_icmp;
583                 }
584 #ifdef CONFIG_IPV6
585                 else if (skb->protocol == htons(ETH_P_IPV6)) {
586                         struct in6_addr *addr6;
587                         int addr_type;
588                         struct neighbour *neigh = skb->dst->neighbour;
589
590                         if (neigh == NULL)
591                                 goto tx_error;
592
593                         addr6 = (struct in6_addr*)&neigh->primary_key;
594                         addr_type = ipv6_addr_type(addr6);
595
596                         if (addr_type == IPV6_ADDR_ANY) {
597                                 addr6 = &ipv6_hdr(skb)->daddr;
598                                 addr_type = ipv6_addr_type(addr6);
599                         }
600
601                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
602                                 goto tx_error_icmp;
603
604                         dst = addr6->s6_addr32[3];
605                 }
606 #endif
607                 else
608                         goto tx_error;
609         }
610
611         tos = tiph->tos;
612         if (tos&1) {
613                 if (skb->protocol == htons(ETH_P_IP))
614                         tos = old_iph->tos;
615                 tos &= ~1;
616         }
617
618         {
619                 struct flowi fl = { .oif = tunnel->parms.link,
620                                     .nl_u = { .ip4_u =
621                                               { .daddr = dst,
622                                                 .saddr = tiph->saddr,
623                                                 .tos = RT_TOS(tos) } },
624                                     .proto = IPPROTO_GRE };
625                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
626                         stats->tx_carrier_errors++;
627                         goto tx_error;
628                 }
629         }
630         tdev = rt->u.dst.dev;
631
632         if (tdev == dev) {
633                 ip_rt_put(rt);
634                 stats->collisions++;
635                 goto tx_error;
636         }
637
638         df = tiph->frag_off;
639         if (df)
640                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
641         else
642                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
643
644         if (skb->dst)
645                 skb->dst->ops->update_pmtu(skb->dst, mtu);
646
647         if (skb->protocol == htons(ETH_P_IP)) {
648                 df |= (old_iph->frag_off&htons(IP_DF));
649
650                 if ((old_iph->frag_off&htons(IP_DF)) &&
651                     mtu < ntohs(old_iph->tot_len)) {
652                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
653                         ip_rt_put(rt);
654                         goto tx_error;
655                 }
656         }
657 #ifdef CONFIG_IPV6
658         else if (skb->protocol == htons(ETH_P_IPV6)) {
659                 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
660
661                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
662                         if ((tunnel->parms.iph.daddr &&
663                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
664                             rt6->rt6i_dst.plen == 128) {
665                                 rt6->rt6i_flags |= RTF_MODIFIED;
666                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
667                         }
668                 }
669
670                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
671                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
672                         ip_rt_put(rt);
673                         goto tx_error;
674                 }
675         }
676 #endif
677
678         if (tunnel->err_count > 0) {
679                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
680                         tunnel->err_count--;
681
682                         dst_link_failure(skb);
683                 } else
684                         tunnel->err_count = 0;
685         }
686
687         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
688
689         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
690             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
691                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
692                 if (!new_skb) {
693                         ip_rt_put(rt);
694                         stats->tx_dropped++;
695                         dev_kfree_skb(skb);
696                         tunnel->recursion--;
697                         return 0;
698                 }
699                 if (skb->sk)
700                         skb_set_owner_w(new_skb, skb->sk);
701                 dev_kfree_skb(skb);
702                 skb = new_skb;
703                 old_iph = ip_hdr(skb);
704         }
705
706         skb->transport_header = skb->network_header;
707         skb_push(skb, gre_hlen);
708         skb_reset_network_header(skb);
709         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
710         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
711                               IPSKB_REROUTED);
712         dst_release(skb->dst);
713         skb->dst = &rt->u.dst;
714
715         /*
716          *      Push down and install the IPIP header.
717          */
718
719         iph                     =       ip_hdr(skb);
720         iph->version            =       4;
721         iph->ihl                =       sizeof(struct iphdr) >> 2;
722         iph->frag_off           =       df;
723         iph->protocol           =       IPPROTO_GRE;
724         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
725         iph->daddr              =       rt->rt_dst;
726         iph->saddr              =       rt->rt_src;
727
728         if ((iph->ttl = tiph->ttl) == 0) {
729                 if (skb->protocol == htons(ETH_P_IP))
730                         iph->ttl = old_iph->ttl;
731 #ifdef CONFIG_IPV6
732                 else if (skb->protocol == htons(ETH_P_IPV6))
733                         iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
734 #endif
735                 else
736                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
737         }
738
739         ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
740         ((__be16*)(iph+1))[1] = skb->protocol;
741
742         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
743                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
744
745                 if (tunnel->parms.o_flags&GRE_SEQ) {
746                         ++tunnel->o_seqno;
747                         *ptr = htonl(tunnel->o_seqno);
748                         ptr--;
749                 }
750                 if (tunnel->parms.o_flags&GRE_KEY) {
751                         *ptr = tunnel->parms.o_key;
752                         ptr--;
753                 }
754                 if (tunnel->parms.o_flags&GRE_CSUM) {
755                         *ptr = 0;
756                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
757                 }
758         }
759
760         nf_reset(skb);
761
762         IPTUNNEL_XMIT();
763         tunnel->recursion--;
764         return 0;
765
766 tx_error_icmp:
767         dst_link_failure(skb);
768
769 tx_error:
770         stats->tx_errors++;
771         dev_kfree_skb(skb);
772         tunnel->recursion--;
773         return 0;
774 }
775
776 static void ipgre_tunnel_bind_dev(struct net_device *dev)
777 {
778         struct net_device *tdev = NULL;
779         struct ip_tunnel *tunnel;
780         struct iphdr *iph;
781         int hlen = LL_MAX_HEADER;
782         int mtu = ETH_DATA_LEN;
783         int addend = sizeof(struct iphdr) + 4;
784
785         tunnel = netdev_priv(dev);
786         iph = &tunnel->parms.iph;
787
788         /* Guess output device to choose reasonable mtu and needed_headroom */
789
790         if (iph->daddr) {
791                 struct flowi fl = { .oif = tunnel->parms.link,
792                                     .nl_u = { .ip4_u =
793                                               { .daddr = iph->daddr,
794                                                 .saddr = iph->saddr,
795                                                 .tos = RT_TOS(iph->tos) } },
796                                     .proto = IPPROTO_GRE };
797                 struct rtable *rt;
798                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
799                         tdev = rt->u.dst.dev;
800                         ip_rt_put(rt);
801                 }
802                 dev->flags |= IFF_POINTOPOINT;
803         }
804
805         if (!tdev && tunnel->parms.link)
806                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
807
808         if (tdev) {
809                 hlen = tdev->hard_header_len + tdev->needed_headroom;
810                 mtu = tdev->mtu;
811         }
812         dev->iflink = tunnel->parms.link;
813
814         /* Precalculate GRE options length */
815         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
816                 if (tunnel->parms.o_flags&GRE_CSUM)
817                         addend += 4;
818                 if (tunnel->parms.o_flags&GRE_KEY)
819                         addend += 4;
820                 if (tunnel->parms.o_flags&GRE_SEQ)
821                         addend += 4;
822         }
823         dev->needed_headroom = addend + hlen;
824         dev->mtu = mtu - dev->hard_header_len - addend;
825         tunnel->hlen = addend;
826
827 }
828
829 static int
830 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
831 {
832         int err = 0;
833         struct ip_tunnel_parm p;
834         struct ip_tunnel *t;
835         struct net *net = dev_net(dev);
836         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
837
838         switch (cmd) {
839         case SIOCGETTUNNEL:
840                 t = NULL;
841                 if (dev == ign->fb_tunnel_dev) {
842                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
843                                 err = -EFAULT;
844                                 break;
845                         }
846                         t = ipgre_tunnel_locate(net, &p, 0);
847                 }
848                 if (t == NULL)
849                         t = netdev_priv(dev);
850                 memcpy(&p, &t->parms, sizeof(p));
851                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
852                         err = -EFAULT;
853                 break;
854
855         case SIOCADDTUNNEL:
856         case SIOCCHGTUNNEL:
857                 err = -EPERM;
858                 if (!capable(CAP_NET_ADMIN))
859                         goto done;
860
861                 err = -EFAULT;
862                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
863                         goto done;
864
865                 err = -EINVAL;
866                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
867                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
868                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
869                         goto done;
870                 if (p.iph.ttl)
871                         p.iph.frag_off |= htons(IP_DF);
872
873                 if (!(p.i_flags&GRE_KEY))
874                         p.i_key = 0;
875                 if (!(p.o_flags&GRE_KEY))
876                         p.o_key = 0;
877
878                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
879
880                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
881                         if (t != NULL) {
882                                 if (t->dev != dev) {
883                                         err = -EEXIST;
884                                         break;
885                                 }
886                         } else {
887                                 unsigned nflags=0;
888
889                                 t = netdev_priv(dev);
890
891                                 if (ipv4_is_multicast(p.iph.daddr))
892                                         nflags = IFF_BROADCAST;
893                                 else if (p.iph.daddr)
894                                         nflags = IFF_POINTOPOINT;
895
896                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
897                                         err = -EINVAL;
898                                         break;
899                                 }
900                                 ipgre_tunnel_unlink(ign, t);
901                                 t->parms.iph.saddr = p.iph.saddr;
902                                 t->parms.iph.daddr = p.iph.daddr;
903                                 t->parms.i_key = p.i_key;
904                                 t->parms.o_key = p.o_key;
905                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
906                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
907                                 ipgre_tunnel_link(ign, t);
908                                 netdev_state_change(dev);
909                         }
910                 }
911
912                 if (t) {
913                         err = 0;
914                         if (cmd == SIOCCHGTUNNEL) {
915                                 t->parms.iph.ttl = p.iph.ttl;
916                                 t->parms.iph.tos = p.iph.tos;
917                                 t->parms.iph.frag_off = p.iph.frag_off;
918                                 if (t->parms.link != p.link) {
919                                         t->parms.link = p.link;
920                                         ipgre_tunnel_bind_dev(dev);
921                                         netdev_state_change(dev);
922                                 }
923                         }
924                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
925                                 err = -EFAULT;
926                 } else
927                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
928                 break;
929
930         case SIOCDELTUNNEL:
931                 err = -EPERM;
932                 if (!capable(CAP_NET_ADMIN))
933                         goto done;
934
935                 if (dev == ign->fb_tunnel_dev) {
936                         err = -EFAULT;
937                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
938                                 goto done;
939                         err = -ENOENT;
940                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
941                                 goto done;
942                         err = -EPERM;
943                         if (t == netdev_priv(ign->fb_tunnel_dev))
944                                 goto done;
945                         dev = t->dev;
946                 }
947                 unregister_netdevice(dev);
948                 err = 0;
949                 break;
950
951         default:
952                 err = -EINVAL;
953         }
954
955 done:
956         return err;
957 }
958
959 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960 {
961         struct ip_tunnel *tunnel = netdev_priv(dev);
962         if (new_mtu < 68 ||
963             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
964                 return -EINVAL;
965         dev->mtu = new_mtu;
966         return 0;
967 }
968
/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
978
979    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
980    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
981
982    ping -t 255 224.66.66.66
983
984    If nobody answers, mbone does not work.
985
986    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
987    ip addr add 10.66.66.<somewhat>/24 dev Universe
988    ifconfig Universe up
989    ifconfig Universe add fe80::<Your_real_addr>/10
990    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
991    ftp 10.66.66.66
992    ...
993    ftp fec0:6666:6666::193.233.7.65
994    ...
995
996  */
997
998 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
999                         unsigned short type,
1000                         const void *daddr, const void *saddr, unsigned len)
1001 {
1002         struct ip_tunnel *t = netdev_priv(dev);
1003         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1004         __be16 *p = (__be16*)(iph+1);
1005
1006         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1007         p[0]            = t->parms.o_flags;
1008         p[1]            = htons(type);
1009
1010         /*
1011          *      Set the source hardware address.
1012          */
1013
1014         if (saddr)
1015                 memcpy(&iph->saddr, saddr, 4);
1016
1017         if (daddr) {
1018                 memcpy(&iph->daddr, daddr, 4);
1019                 return t->hlen;
1020         }
1021         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1022                 return t->hlen;
1023
1024         return -t->hlen;
1025 }
1026
1027 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1028 {
1029         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1030         memcpy(haddr, &iph->saddr, 4);
1031         return 4;
1032 }
1033
/*
 * Header callbacks for GRE devices.  ipgre_tunnel_init() installs these
 * on devices whose tunnel has no destination address and, when
 * CONFIG_NET_IPGRE_BROADCAST is set, on devices with a multicast
 * destination.
 */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,		/* build outer IP + GRE header */
	.parse	= ipgre_header_parse,	/* extract the sender's address */
};
1038
1039 #ifdef CONFIG_NET_IPGRE_BROADCAST
1040 static int ipgre_open(struct net_device *dev)
1041 {
1042         struct ip_tunnel *t = netdev_priv(dev);
1043
1044         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1045                 struct flowi fl = { .oif = t->parms.link,
1046                                     .nl_u = { .ip4_u =
1047                                               { .daddr = t->parms.iph.daddr,
1048                                                 .saddr = t->parms.iph.saddr,
1049                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1050                                     .proto = IPPROTO_GRE };
1051                 struct rtable *rt;
1052                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1053                         return -EADDRNOTAVAIL;
1054                 dev = rt->u.dst.dev;
1055                 ip_rt_put(rt);
1056                 if (__in_dev_get_rtnl(dev) == NULL)
1057                         return -EADDRNOTAVAIL;
1058                 t->mlink = dev->ifindex;
1059                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1060         }
1061         return 0;
1062 }
1063
1064 static int ipgre_close(struct net_device *dev)
1065 {
1066         struct ip_tunnel *t = netdev_priv(dev);
1067         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1068                 struct in_device *in_dev;
1069                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1070                 if (in_dev) {
1071                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1072                         in_dev_put(in_dev);
1073                 }
1074         }
1075         return 0;
1076 }
1077
1078 #endif
1079
1080 static void ipgre_tunnel_setup(struct net_device *dev)
1081 {
1082         dev->uninit             = ipgre_tunnel_uninit;
1083         dev->destructor         = free_netdev;
1084         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1085         dev->do_ioctl           = ipgre_tunnel_ioctl;
1086         dev->change_mtu         = ipgre_tunnel_change_mtu;
1087
1088         dev->type               = ARPHRD_IPGRE;
1089         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1090         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1091         dev->flags              = IFF_NOARP;
1092         dev->iflink             = 0;
1093         dev->addr_len           = 4;
1094         dev->features           |= NETIF_F_NETNS_LOCAL;
1095 }
1096
1097 static int ipgre_tunnel_init(struct net_device *dev)
1098 {
1099         struct ip_tunnel *tunnel;
1100         struct iphdr *iph;
1101
1102         tunnel = netdev_priv(dev);
1103         iph = &tunnel->parms.iph;
1104
1105         tunnel->dev = dev;
1106         strcpy(tunnel->parms.name, dev->name);
1107
1108         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1109         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1110
1111         ipgre_tunnel_bind_dev(dev);
1112
1113         if (iph->daddr) {
1114 #ifdef CONFIG_NET_IPGRE_BROADCAST
1115                 if (ipv4_is_multicast(iph->daddr)) {
1116                         if (!iph->saddr)
1117                                 return -EINVAL;
1118                         dev->flags = IFF_BROADCAST;
1119                         dev->header_ops = &ipgre_header_ops;
1120                         dev->open = ipgre_open;
1121                         dev->stop = ipgre_close;
1122                 }
1123 #endif
1124         } else
1125                 dev->header_ops = &ipgre_header_ops;
1126
1127         return 0;
1128 }
1129
1130 static int ipgre_fb_tunnel_init(struct net_device *dev)
1131 {
1132         struct ip_tunnel *tunnel = netdev_priv(dev);
1133         struct iphdr *iph = &tunnel->parms.iph;
1134         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1135
1136         tunnel->dev = dev;
1137         strcpy(tunnel->parms.name, dev->name);
1138
1139         iph->version            = 4;
1140         iph->protocol           = IPPROTO_GRE;
1141         iph->ihl                = 5;
1142         tunnel->hlen            = sizeof(struct iphdr) + 4;
1143
1144         dev_hold(dev);
1145         ign->tunnels_wc[0]      = tunnel;
1146         return 0;
1147 }
1148
1149
/*
 * Protocol hooks for IPPROTO_GRE; ipgre_init() registers them with
 * inet_add_protocol() and ipgre_fini() removes them.
 */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,	/* incoming GRE packets */
	.err_handler	=	ipgre_err,	/* error notifications */
	.netns_ok	=	1,	/* NOTE(review): presumably marks the handlers as namespace-aware - confirm */
};
1155
1156 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1157 {
1158         int prio;
1159
1160         for (prio = 0; prio < 4; prio++) {
1161                 int h;
1162                 for (h = 0; h < HASH_SIZE; h++) {
1163                         struct ip_tunnel *t;
1164                         while ((t = ign->tunnels[prio][h]) != NULL)
1165                                 unregister_netdevice(t->dev);
1166                 }
1167         }
1168 }
1169
1170 static int ipgre_init_net(struct net *net)
1171 {
1172         int err;
1173         struct ipgre_net *ign;
1174
1175         err = -ENOMEM;
1176         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1177         if (ign == NULL)
1178                 goto err_alloc;
1179
1180         err = net_assign_generic(net, ipgre_net_id, ign);
1181         if (err < 0)
1182                 goto err_assign;
1183
1184         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1185                                            ipgre_tunnel_setup);
1186         if (!ign->fb_tunnel_dev) {
1187                 err = -ENOMEM;
1188                 goto err_alloc_dev;
1189         }
1190
1191         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1192         dev_net_set(ign->fb_tunnel_dev, net);
1193
1194         if ((err = register_netdev(ign->fb_tunnel_dev)))
1195                 goto err_reg_dev;
1196
1197         return 0;
1198
1199 err_reg_dev:
1200         free_netdev(ign->fb_tunnel_dev);
1201 err_alloc_dev:
1202         /* nothing */
1203 err_assign:
1204         kfree(ign);
1205 err_alloc:
1206         return err;
1207 }
1208
1209 static void ipgre_exit_net(struct net *net)
1210 {
1211         struct ipgre_net *ign;
1212
1213         ign = net_generic(net, ipgre_net_id);
1214         rtnl_lock();
1215         ipgre_destroy_tunnels(ign);
1216         rtnl_unlock();
1217         kfree(ign);
1218 }
1219
/*
 * Per-network-namespace hooks, registered by ipgre_init() through
 * register_pernet_gen_device() together with ipgre_net_id.
 */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,	/* set up per-namespace state and "gre0" */
	.exit = ipgre_exit_net,	/* destroy tunnels and free that state */
};
1224
/*
 *      And now the module code and kernel interface.
 */
1228
1229 static int __init ipgre_init(void)
1230 {
1231         int err;
1232
1233         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1234
1235         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1236                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1237                 return -EAGAIN;
1238         }
1239
1240         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1241         if (err < 0)
1242                 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1243
1244         return err;
1245 }
1246
1247 static void __exit ipgre_fini(void)
1248 {
1249         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1250                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1251
1252         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1253 }
1254
/* Module entry and exit points, and the module's license tag. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");