]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blob - net/ipv4/ipvs/ip_vs_core.c
1f4f3b9435952a8de8129eb0baa5417baaa88177
[linux-2.6-omap-h63xx.git] / net / ipv4 / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
48
49
/*
 * Symbols exported from the IPVS core.
 * Unconditional exports come first; exports that depend on a build
 * option follow, each under its own guard.
 */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
EXPORT_SYMBOL(ip_vs_conn_put);

#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif

#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
64
65
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP
 * (icmp_id) or ICMPv6 (icmpv6_id) header.
 *
 * The argument is a pointer to the header.  It is parenthesized in
 * both macros so that any pointer expression (e.g. "hdr + 1") expands
 * as intended.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
69
/*
 * Return a printable name for an IP protocol number.
 *
 * @proto: protocol number (IPPROTO_*)
 *
 * Known protocols yield a string literal.  Any other value is
 * formatted as "IP_<number>" into a function-local static buffer, so
 * that result is overwritten by the next call taking the default
 * path and must not be relied on across calls.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/*
		 * proto is unsigned, so it is printed with %u; the write
		 * is bounded by the buffer size.
		 */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
92
93 void ip_vs_init_hash_table(struct list_head *table, int rows)
94 {
95         while (--rows >= 0)
96                 INIT_LIST_HEAD(&table[rows]);
97 }
98
99 static inline void
100 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
101 {
102         struct ip_vs_dest *dest = cp->dest;
103         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
104                 spin_lock(&dest->stats.lock);
105                 dest->stats.inpkts++;
106                 dest->stats.inbytes += skb->len;
107                 spin_unlock(&dest->stats.lock);
108
109                 spin_lock(&dest->svc->stats.lock);
110                 dest->svc->stats.inpkts++;
111                 dest->svc->stats.inbytes += skb->len;
112                 spin_unlock(&dest->svc->stats.lock);
113
114                 spin_lock(&ip_vs_stats.lock);
115                 ip_vs_stats.inpkts++;
116                 ip_vs_stats.inbytes += skb->len;
117                 spin_unlock(&ip_vs_stats.lock);
118         }
119 }
120
121
122 static inline void
123 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
124 {
125         struct ip_vs_dest *dest = cp->dest;
126         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
127                 spin_lock(&dest->stats.lock);
128                 dest->stats.outpkts++;
129                 dest->stats.outbytes += skb->len;
130                 spin_unlock(&dest->stats.lock);
131
132                 spin_lock(&dest->svc->stats.lock);
133                 dest->svc->stats.outpkts++;
134                 dest->svc->stats.outbytes += skb->len;
135                 spin_unlock(&dest->svc->stats.lock);
136
137                 spin_lock(&ip_vs_stats.lock);
138                 ip_vs_stats.outpkts++;
139                 ip_vs_stats.outbytes += skb->len;
140                 spin_unlock(&ip_vs_stats.lock);
141         }
142 }
143
144
145 static inline void
146 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
147 {
148         spin_lock(&cp->dest->stats.lock);
149         cp->dest->stats.conns++;
150         spin_unlock(&cp->dest->stats.lock);
151
152         spin_lock(&svc->stats.lock);
153         svc->stats.conns++;
154         spin_unlock(&svc->stats.lock);
155
156         spin_lock(&ip_vs_stats.lock);
157         ip_vs_stats.conns++;
158         spin_unlock(&ip_vs_stats.lock);
159 }
160
161
162 static inline int
163 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
164                 const struct sk_buff *skb,
165                 struct ip_vs_protocol *pp)
166 {
167         if (unlikely(!pp->state_transition))
168                 return 0;
169         return pp->state_transition(cp, direction, skb, pp);
170 }
171
172
173 /*
174  *  IPVS persistent scheduling function
175  *  It creates a connection entry according to its template if exists,
176  *  or selects a server and creates a connection entry plus a template.
177  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
178  *  Protocols supported: TCP, UDP
179  */
180 static struct ip_vs_conn *
181 ip_vs_sched_persist(struct ip_vs_service *svc,
182                     const struct sk_buff *skb,
183                     __be16 ports[2])
184 {
185         struct ip_vs_conn *cp = NULL;
186         struct ip_vs_iphdr iph;
187         struct ip_vs_dest *dest;
188         struct ip_vs_conn *ct;
189         __be16  dport;                  /* destination port to forward */
190         union nf_inet_addr snet;        /* source network of the client,
191                                            after masking */
192
193         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
194
195         /* Mask saddr with the netmask to adjust template granularity */
196 #ifdef CONFIG_IP_VS_IPV6
197         if (svc->af == AF_INET6)
198                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199         else
200 #endif
201                 snet.ip = iph.saddr.ip & svc->netmask;
202
203         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
204                       "mnet %s\n",
205                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
206                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
207                       IP_VS_DBG_ADDR(svc->af, &snet));
208
209         /*
210          * As far as we know, FTP is a very complicated network protocol, and
211          * it uses control connection and data connections. For active FTP,
212          * FTP server initialize data connection to the client, its source port
213          * is often 20. For passive FTP, FTP server tells the clients the port
214          * that it passively listens to,  and the client issues the data
215          * connection. In the tunneling or direct routing mode, the load
216          * balancer is on the client-to-server half of connection, the port
217          * number is unknown to the load balancer. So, a conn template like
218          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
219          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
220          * is created for other persistent services.
221          */
222         if (ports[1] == svc->port) {
223                 /* Check if a template already exists */
224                 if (svc->port != FTPPORT)
225                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
226                                              &iph.daddr, ports[1]);
227                 else
228                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229                                              &iph.daddr, 0);
230
231                 if (!ct || !ip_vs_check_template(ct)) {
232                         /*
233                          * No template found or the dest of the connection
234                          * template is not available.
235                          */
236                         dest = svc->scheduler->schedule(svc, skb);
237                         if (dest == NULL) {
238                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
239                                 return NULL;
240                         }
241
242                         /*
243                          * Create a template like <protocol,caddr,0,
244                          * vaddr,vport,daddr,dport> for non-ftp service,
245                          * and <protocol,caddr,0,vaddr,0,daddr,0>
246                          * for ftp service.
247                          */
248                         if (svc->port != FTPPORT)
249                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
250                                                     &snet, 0,
251                                                     &iph.daddr,
252                                                     ports[1],
253                                                     &dest->addr, dest->port,
254                                                     IP_VS_CONN_F_TEMPLATE,
255                                                     dest);
256                         else
257                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
258                                                     &snet, 0,
259                                                     &iph.daddr, 0,
260                                                     &dest->addr, 0,
261                                                     IP_VS_CONN_F_TEMPLATE,
262                                                     dest);
263                         if (ct == NULL)
264                                 return NULL;
265
266                         ct->timeout = svc->timeout;
267                 } else {
268                         /* set destination with the found template */
269                         dest = ct->dest;
270                 }
271                 dport = dest->port;
272         } else {
273                 /*
274                  * Note: persistent fwmark-based services and persistent
275                  * port zero service are handled here.
276                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
277                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
278                  */
279                 if (svc->fwmark) {
280                         union nf_inet_addr fwmark = {
281                                 .all = { 0, 0, 0, htonl(svc->fwmark) }
282                         };
283
284                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285                                              &fwmark, 0);
286                 } else
287                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288                                              &iph.daddr, 0);
289
290                 if (!ct || !ip_vs_check_template(ct)) {
291                         /*
292                          * If it is not persistent port zero, return NULL,
293                          * otherwise create a connection template.
294                          */
295                         if (svc->port)
296                                 return NULL;
297
298                         dest = svc->scheduler->schedule(svc, skb);
299                         if (dest == NULL) {
300                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
301                                 return NULL;
302                         }
303
304                         /*
305                          * Create a template according to the service
306                          */
307                         if (svc->fwmark) {
308                                 union nf_inet_addr fwmark = {
309                                         .all = { 0, 0, 0, htonl(svc->fwmark) }
310                                 };
311
312                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313                                                     &snet, 0,
314                                                     &fwmark, 0,
315                                                     &dest->addr, 0,
316                                                     IP_VS_CONN_F_TEMPLATE,
317                                                     dest);
318                         } else
319                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
320                                                     &snet, 0,
321                                                     &iph.daddr, 0,
322                                                     &dest->addr, 0,
323                                                     IP_VS_CONN_F_TEMPLATE,
324                                                     dest);
325                         if (ct == NULL)
326                                 return NULL;
327
328                         ct->timeout = svc->timeout;
329                 } else {
330                         /* set destination with the found template */
331                         dest = ct->dest;
332                 }
333                 dport = ports[1];
334         }
335
336         /*
337          *    Create a new connection according to the template
338          */
339         cp = ip_vs_conn_new(svc->af, iph.protocol,
340                             &iph.saddr, ports[0],
341                             &iph.daddr, ports[1],
342                             &dest->addr, dport,
343                             0,
344                             dest);
345         if (cp == NULL) {
346                 ip_vs_conn_put(ct);
347                 return NULL;
348         }
349
350         /*
351          *    Add its control
352          */
353         ip_vs_control_add(cp, ct);
354         ip_vs_conn_put(ct);
355
356         ip_vs_conn_stats(cp, svc);
357         return cp;
358 }
359
360
361 /*
362  *  IPVS main scheduling function
363  *  It selects a server according to the virtual service, and
364  *  creates a connection entry.
365  *  Protocols supported: TCP, UDP
366  */
367 struct ip_vs_conn *
368 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
369 {
370         struct ip_vs_conn *cp = NULL;
371         struct ip_vs_iphdr iph;
372         struct ip_vs_dest *dest;
373         __be16 _ports[2], *pptr;
374
375         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
376         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
377         if (pptr == NULL)
378                 return NULL;
379
380         /*
381          *    Persistent service
382          */
383         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
384                 return ip_vs_sched_persist(svc, skb, pptr);
385
386         /*
387          *    Non-persistent service
388          */
389         if (!svc->fwmark && pptr[1] != svc->port) {
390                 if (!svc->port)
391                         IP_VS_ERR("Schedule: port zero only supported "
392                                   "in persistent services, "
393                                   "check your ipvs configuration\n");
394                 return NULL;
395         }
396
397         dest = svc->scheduler->schedule(svc, skb);
398         if (dest == NULL) {
399                 IP_VS_DBG(1, "Schedule: no dest found.\n");
400                 return NULL;
401         }
402
403         /*
404          *    Create a connection entry.
405          */
406         cp = ip_vs_conn_new(svc->af, iph.protocol,
407                             &iph.saddr, pptr[0],
408                             &iph.daddr, pptr[1],
409                             &dest->addr, dest->port ? dest->port : pptr[1],
410                             0,
411                             dest);
412         if (cp == NULL)
413                 return NULL;
414
415         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
416                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
417                       ip_vs_fwd_tag(cp),
418                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
419                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
420                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
421                       cp->flags, atomic_read(&cp->refcnt));
422
423         ip_vs_conn_stats(cp, svc);
424         return cp;
425 }
426
427
428 /*
429  *  Pass or drop the packet.
430  *  Called by ip_vs_in, when the virtual service is available but
431  *  no destination is available for a new connection.
432  */
433 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
434                 struct ip_vs_protocol *pp)
435 {
436         __be16 _ports[2], *pptr;
437         struct ip_vs_iphdr iph;
438         int unicast;
439         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
440
441         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
442         if (pptr == NULL) {
443                 ip_vs_service_put(svc);
444                 return NF_DROP;
445         }
446
447 #ifdef CONFIG_IP_VS_IPV6
448         if (svc->af == AF_INET6)
449                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450         else
451 #endif
452                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453
454         /* if it is fwmark-based service, the cache_bypass sysctl is up
455            and the destination is a non-local unicast, then create
456            a cache_bypass connection entry */
457         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
458                 int ret, cs;
459                 struct ip_vs_conn *cp;
460
461                 ip_vs_service_put(svc);
462
463                 /* create a new connection entry */
464                 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
465                 cp = ip_vs_conn_new(svc->af, iph.protocol,
466                                     &iph.saddr, pptr[0],
467                                     &iph.daddr, pptr[1],
468                                     0, 0,
469                                     IP_VS_CONN_F_BYPASS,
470                                     NULL);
471                 if (cp == NULL)
472                         return NF_DROP;
473
474                 /* statistics */
475                 ip_vs_in_stats(cp, skb);
476
477                 /* set state */
478                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
479
480                 /* transmit the first SYN packet */
481                 ret = cp->packet_xmit(skb, cp, pp);
482                 /* do not touch skb anymore */
483
484                 atomic_inc(&cp->in_pkts);
485                 ip_vs_conn_put(cp);
486                 return ret;
487         }
488
489         /*
490          * When the virtual ftp service is presented, packets destined
491          * for other services on the VIP may get here (except services
492          * listed in the ipvs table), pass the packets, because it is
493          * not ipvs job to decide to drop the packets.
494          */
495         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
496                 ip_vs_service_put(svc);
497                 return NF_ACCEPT;
498         }
499
500         ip_vs_service_put(svc);
501
502         /*
503          * Notify the client that the destination is unreachable, and
504          * release the socket buffer.
505          * Since it is in IP layer, the TCP socket is not actually
506          * created, the TCP RST packet cannot be sent, instead that
507          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
508          */
509 #ifdef CONFIG_IP_VS_IPV6
510         if (svc->af == AF_INET6)
511                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
512                             skb->dev);
513         else
514 #endif
515                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
516
517         return NF_DROP;
518 }
519
520
521 /*
522  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
523  *      chain, and is used for VS/NAT.
524  *      It detects packets for VS/NAT connections and sends the packets
525  *      immediately. This can avoid that iptable_nat mangles the packets
526  *      for VS/NAT.
527  */
528 static unsigned int ip_vs_post_routing(unsigned int hooknum,
529                                        struct sk_buff *skb,
530                                        const struct net_device *in,
531                                        const struct net_device *out,
532                                        int (*okfn)(struct sk_buff *))
533 {
534         if (!skb->ipvs_property)
535                 return NF_ACCEPT;
536         /* The packet was sent from IPVS, exit this chain */
537         return NF_STOP;
538 }
539
540 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
541 {
542         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
543 }
544
545 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
546 {
547         int err = ip_defrag(skb, user);
548
549         if (!err)
550                 ip_send_check(ip_hdr(skb));
551
552         return err;
553 }
554
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 placeholder with the same signature as ip_vs_gather_frags().
 * No reassembly is performed: @skb and @user are ignored and 0 is
 * always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
562
563 /*
564  * Packet has been made sufficiently writable in caller
565  * - inout: 1=in->out, 0=out->in
566  */
567 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
568                     struct ip_vs_conn *cp, int inout)
569 {
570         struct iphdr *iph        = ip_hdr(skb);
571         unsigned int icmp_offset = iph->ihl*4;
572         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
573                                                       icmp_offset);
574         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
575
576         if (inout) {
577                 iph->saddr = cp->vaddr.ip;
578                 ip_send_check(iph);
579                 ciph->daddr = cp->vaddr.ip;
580                 ip_send_check(ciph);
581         } else {
582                 iph->daddr = cp->daddr.ip;
583                 ip_send_check(iph);
584                 ciph->saddr = cp->daddr.ip;
585                 ip_send_check(ciph);
586         }
587
588         /* the TCP/UDP port */
589         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
590                 __be16 *ports = (void *)ciph + ciph->ihl*4;
591
592                 if (inout)
593                         ports[1] = cp->vport;
594                 else
595                         ports[0] = cp->dport;
596         }
597
598         /* And finally the ICMP checksum */
599         icmph->checksum = 0;
600         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
601         skb->ip_summed = CHECKSUM_UNNECESSARY;
602
603         if (inout)
604                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
605                         "Forwarding altered outgoing ICMP");
606         else
607                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
608                         "Forwarding altered incoming ICMP");
609 }
610
#ifdef CONFIG_IP_VS_IPV6
/*
 * Rewrite an ICMPv6 message, and the IPv6 header embedded in it, for
 * a VS/NAT connection.
 *
 * Packet has been made sufficiently writable in caller
 * - inout: 1=in->out, 0=out->in
 *
 * in->out: the outer source and the embedded destination become the
 *          virtual address; an embedded TCP/UDP destination port
 *          becomes the virtual port.
 * out->in: the outer destination and the embedded source become the
 *          real server address; an embedded TCP/UDP source port
 *          becomes the real server port.
 *
 * Both the ICMPv6 header and the embedded transport header are located
 * at fixed sizeof(struct ipv6hdr) offsets, i.e. extension headers are
 * not skipped.
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph      = ipv6_hdr(skb);
	unsigned int icmp_offset = sizeof(struct ipv6hdr);
	struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
						      icmp_offset);
	struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP port */
	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/*
	 * And finally the ICMPv6 checksum.  Unlike ICMPv4 it covers an
	 * IPv6 pseudo-header, so the sum over the ICMPv6 message is
	 * folded together with the (already rewritten) outer addresses,
	 * the message length and the next-header value, and the result is
	 * stored in the header.
	 */
	icmph->icmp6_cksum = 0;
	icmph->icmp6_cksum = csum_ipv6_magic(&iph->saddr, &iph->daddr,
					     skb->len - icmp_offset,
					     IPPROTO_ICMPV6,
					     skb_checksum(skb, icmp_offset,
							  skb->len - icmp_offset,
							  0));
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (inout)
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered incoming ICMPv6");
}
#endif
653
654 /* Handle relevant response ICMP messages - forward to the right
655  * destination host. Used for NAT and local client.
656  */
657 static int handle_response_icmp(int af, struct sk_buff *skb,
658                                 union nf_inet_addr *snet,
659                                 __u8 protocol, struct ip_vs_conn *cp,
660                                 struct ip_vs_protocol *pp,
661                                 unsigned int offset, unsigned int ihl)
662 {
663         unsigned int verdict = NF_DROP;
664
665         if (IP_VS_FWD_METHOD(cp) != 0) {
666                 IP_VS_ERR("shouldn't reach here, because the box is on the "
667                           "half connection in the tun/dr module.\n");
668         }
669
670         /* Ensure the checksum is correct */
671         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
672                 /* Failed checksum! */
673                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
674                               IP_VS_DBG_ADDR(af, snet));
675                 goto out;
676         }
677
678         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
679                 offset += 2 * sizeof(__u16);
680         if (!skb_make_writable(skb, offset))
681                 goto out;
682
683 #ifdef CONFIG_IP_VS_IPV6
684         if (af == AF_INET6)
685                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
686         else
687 #endif
688                 ip_vs_nat_icmp(skb, pp, cp, 1);
689
690         /* do the statistics and put it back */
691         ip_vs_out_stats(cp, skb);
692
693         skb->ipvs_property = 1;
694         verdict = NF_ACCEPT;
695
696 out:
697         __ip_vs_conn_put(cp);
698
699         return verdict;
700 }
701
702 /*
703  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
704  *      Find any that might be relevant, check against existing connections.
705  *      Currently handles error types - unreachable, quench, ttl exceeded.
706  */
707 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
708 {
709         struct iphdr *iph;
710         struct icmphdr  _icmph, *ic;
711         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
712         struct ip_vs_iphdr ciph;
713         struct ip_vs_conn *cp;
714         struct ip_vs_protocol *pp;
715         unsigned int offset, ihl;
716         union nf_inet_addr snet;
717
718         *related = 1;
719
720         /* reassemble IP fragments */
721         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
722                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
723                         return NF_STOLEN;
724         }
725
726         iph = ip_hdr(skb);
727         offset = ihl = iph->ihl * 4;
728         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
729         if (ic == NULL)
730                 return NF_DROP;
731
732         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
733                   ic->type, ntohs(icmp_id(ic)),
734                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
735
736         /*
737          * Work through seeing if this is for us.
738          * These checks are supposed to be in an order that means easy
739          * things are checked first to speed up processing.... however
740          * this means that some packets will manage to get a long way
741          * down this stack and then be rejected, but that's life.
742          */
743         if ((ic->type != ICMP_DEST_UNREACH) &&
744             (ic->type != ICMP_SOURCE_QUENCH) &&
745             (ic->type != ICMP_TIME_EXCEEDED)) {
746                 *related = 0;
747                 return NF_ACCEPT;
748         }
749
750         /* Now find the contained IP header */
751         offset += sizeof(_icmph);
752         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
753         if (cih == NULL)
754                 return NF_ACCEPT; /* The packet looks wrong, ignore */
755
756         pp = ip_vs_proto_get(cih->protocol);
757         if (!pp)
758                 return NF_ACCEPT;
759
760         /* Is the embedded protocol header present? */
761         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
762                      pp->dont_defrag))
763                 return NF_ACCEPT;
764
765         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
766
767         offset += cih->ihl * 4;
768
769         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
770         /* The embedded headers contain source and dest in reverse order */
771         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
772         if (!cp)
773                 return NF_ACCEPT;
774
775         snet.ip = iph->saddr;
776         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
777                                     pp, offset, ihl);
778 }
779
#ifdef CONFIG_IP_VS_IPV6
/*
 *	Handle an outgoing ICMPv6 message.
 *
 *	Only error messages of type destination-unreachable, packet-too-big
 *	and time-exceeded are examined; for every other type *related is
 *	cleared and the packet is accepted so the caller can continue with
 *	normal processing.  For the examined types the IPv6 header quoted in
 *	the ICMPv6 payload is used to look up an existing connection, and on
 *	a hit the packet is passed to handle_response_icmp().
 *
 *	Returns a netfilter verdict.
 */
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
{
	struct ipv6hdr *outer;
	struct icmp6hdr icmp_buf, *icmp;
	struct ipv6hdr inner_buf, *inner;	/* header quoted inside the ICMPv6 */
	struct ip_vs_iphdr inner_iph;
	struct ip_vs_conn *conn;
	struct ip_vs_protocol *proto;
	unsigned int off = sizeof(struct ipv6hdr);
	union nf_inet_addr src;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
		return NF_STOLEN;

	/* Fetch the header pointer only now, after the possible reassembly. */
	outer = ipv6_hdr(skb);
	icmp = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
	if (!icmp)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
		  icmp->icmp6_type, ntohs(icmpv6_id(icmp)),
		  NIP6(outer->saddr), NIP6(outer->daddr));

	/*
	 * Cheap test first: anything that is not one of the error types
	 * we care about is reported as unrelated and left alone.
	 */
	switch (icmp->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Step over the ICMPv6 header to the quoted IPv6 header. */
	off += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, off, sizeof(inner_buf), &inner_buf);
	if (!inner)
		return NF_ACCEPT;	/* The packet looks wrong, ignore */

	proto = ip_vs_proto_get(inner->nexthdr);
	if (!proto)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && proto->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, proto, skb, off, "Checking outgoing ICMPv6 for");

	/* Advance to the transport header that follows the quoted header. */
	off += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, inner, &inner_iph);
	/* The embedded headers contain source and dest in reverse order */
	conn = proto->conn_out_get(AF_INET6, skb, proto, &inner_iph, off, 1);
	if (!conn)
		return NF_ACCEPT;

	src.in6 = outer->saddr;
	return handle_response_icmp(AF_INET6, skb, &src, inner->nexthdr, conn,
				    proto, off, sizeof(struct ipv6hdr));
}
#endif
855
856 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
857 {
858         struct tcphdr _tcph, *th;
859
860         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
861         if (th == NULL)
862                 return 0;
863         return th->rst;
864 }
865
866 /* Handle response packets: rewrite addresses and send away...
867  * Used for NAT and local client.
868  */
869 static unsigned int
870 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
871                 struct ip_vs_conn *cp, int ihl)
872 {
873         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
874
875         if (!skb_make_writable(skb, ihl))
876                 goto drop;
877
878         /* mangle the packet */
879         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
880                 goto drop;
881
882 #ifdef CONFIG_IP_VS_IPV6
883         if (af == AF_INET6)
884                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
885         else
886 #endif
887         {
888                 ip_hdr(skb)->saddr = cp->vaddr.ip;
889                 ip_send_check(ip_hdr(skb));
890         }
891
892         /* For policy routing, packets originating from this
893          * machine itself may be routed differently to packets
894          * passing through.  We want this packet to be routed as
895          * if it came from this machine itself.  So re-compute
896          * the routing information.
897          */
898 #ifdef CONFIG_IP_VS_IPV6
899         if (af == AF_INET6) {
900                 if (ip6_route_me_harder(skb) != 0)
901                         goto drop;
902         } else
903 #endif
904                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
905                         goto drop;
906
907         /* For policy routing, packets originating from this
908          * machine itself may be routed differently to packets
909          * passing through.  We want this packet to be routed as
910          * if it came from this machine itself.  So re-compute
911          * the routing information.
912          */
913         if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
914                 goto drop;
915
916         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
917
918         ip_vs_out_stats(cp, skb);
919         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
920         ip_vs_conn_put(cp);
921
922         skb->ipvs_property = 1;
923
924         LeaveFunction(11);
925         return NF_ACCEPT;
926
927 drop:
928         ip_vs_conn_put(cp);
929         kfree_skb(skb);
930         return NF_STOLEN;
931 }
932
933 /*
934  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
935  *      Check if outgoing packet belongs to the established ip_vs_conn.
936  */
937 static unsigned int
938 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
939           const struct net_device *in, const struct net_device *out,
940           int (*okfn)(struct sk_buff *))
941 {
942         struct ip_vs_iphdr iph;
943         struct ip_vs_protocol *pp;
944         struct ip_vs_conn *cp;
945         int af;
946
947         EnterFunction(11);
948
949         af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
950
951         if (skb->ipvs_property)
952                 return NF_ACCEPT;
953
954         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
955 #ifdef CONFIG_IP_VS_IPV6
956         if (af == AF_INET6) {
957                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
958                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
959
960                         if (related)
961                                 return verdict;
962                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
963                 }
964         } else
965 #endif
966                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
967                         int related, verdict = ip_vs_out_icmp(skb, &related);
968
969                         if (related)
970                                 return verdict;
971                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
972                 }
973
974         pp = ip_vs_proto_get(iph.protocol);
975         if (unlikely(!pp))
976                 return NF_ACCEPT;
977
978         /* reassemble IP fragments */
979 #ifdef CONFIG_IP_VS_IPV6
980         if (af == AF_INET6) {
981                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
982                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
983
984                         if (related)
985                                 return verdict;
986
987                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
988                 }
989         } else
990 #endif
991                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
992                              !pp->dont_defrag)) {
993                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
994                                 return NF_STOLEN;
995
996                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
997                 }
998
999         /*
1000          * Check if the packet belongs to an existing entry
1001          */
1002         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1003
1004         if (unlikely(!cp)) {
1005                 if (sysctl_ip_vs_nat_icmp_send &&
1006                     (pp->protocol == IPPROTO_TCP ||
1007                      pp->protocol == IPPROTO_UDP)) {
1008                         __be16 _ports[2], *pptr;
1009
1010                         pptr = skb_header_pointer(skb, iph.len,
1011                                                   sizeof(_ports), _ports);
1012                         if (pptr == NULL)
1013                                 return NF_ACCEPT;       /* Not for me */
1014                         if (ip_vs_lookup_real_service(af, iph.protocol,
1015                                                       &iph.saddr,
1016                                                       pptr[0])) {
1017                                 /*
1018                                  * Notify the real server: there is no
1019                                  * existing entry if it is not RST
1020                                  * packet or not TCP packet.
1021                                  */
1022                                 if (iph.protocol != IPPROTO_TCP
1023                                     || !is_tcp_reset(skb, iph.len)) {
1024 #ifdef CONFIG_IP_VS_IPV6
1025                                         if (af == AF_INET6)
1026                                                 icmpv6_send(skb,
1027                                                             ICMPV6_DEST_UNREACH,
1028                                                             ICMPV6_PORT_UNREACH,
1029                                                             0, skb->dev);
1030                                         else
1031 #endif
1032                                                 icmp_send(skb,
1033                                                           ICMP_DEST_UNREACH,
1034                                                           ICMP_PORT_UNREACH, 0);
1035                                         return NF_DROP;
1036                                 }
1037                         }
1038                 }
1039                 IP_VS_DBG_PKT(12, pp, skb, 0,
1040                               "packet continues traversal as normal");
1041                 return NF_ACCEPT;
1042         }
1043
1044         return handle_response(af, skb, pp, cp, iph.len);
1045 }
1046
1047
1048 /*
1049  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1050  *      Find any that might be relevant, check against existing connections,
1051  *      forward to the right destination host if relevant.
1052  *      Currently handles error types - unreachable, quench, ttl exceeded.
1053  */
1054 static int
1055 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1056 {
1057         struct iphdr *iph;
1058         struct icmphdr  _icmph, *ic;
1059         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1060         struct ip_vs_iphdr ciph;
1061         struct ip_vs_conn *cp;
1062         struct ip_vs_protocol *pp;
1063         unsigned int offset, ihl, verdict;
1064         union nf_inet_addr snet;
1065
1066         *related = 1;
1067
1068         /* reassemble IP fragments */
1069         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1070                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1071                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1072                         return NF_STOLEN;
1073         }
1074
1075         iph = ip_hdr(skb);
1076         offset = ihl = iph->ihl * 4;
1077         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1078         if (ic == NULL)
1079                 return NF_DROP;
1080
1081         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
1082                   ic->type, ntohs(icmp_id(ic)),
1083                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1084
1085         /*
1086          * Work through seeing if this is for us.
1087          * These checks are supposed to be in an order that means easy
1088          * things are checked first to speed up processing.... however
1089          * this means that some packets will manage to get a long way
1090          * down this stack and then be rejected, but that's life.
1091          */
1092         if ((ic->type != ICMP_DEST_UNREACH) &&
1093             (ic->type != ICMP_SOURCE_QUENCH) &&
1094             (ic->type != ICMP_TIME_EXCEEDED)) {
1095                 *related = 0;
1096                 return NF_ACCEPT;
1097         }
1098
1099         /* Now find the contained IP header */
1100         offset += sizeof(_icmph);
1101         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1102         if (cih == NULL)
1103                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1104
1105         pp = ip_vs_proto_get(cih->protocol);
1106         if (!pp)
1107                 return NF_ACCEPT;
1108
1109         /* Is the embedded protocol header present? */
1110         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1111                      pp->dont_defrag))
1112                 return NF_ACCEPT;
1113
1114         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1115
1116         offset += cih->ihl * 4;
1117
1118         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1119         /* The embedded headers contain source and dest in reverse order */
1120         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1121         if (!cp) {
1122                 /* The packet could also belong to a local client */
1123                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1124                 if (cp) {
1125                         snet.ip = iph->saddr;
1126                         return handle_response_icmp(AF_INET, skb, &snet,
1127                                                     cih->protocol, cp, pp,
1128                                                     offset, ihl);
1129                 }
1130                 return NF_ACCEPT;
1131         }
1132
1133         verdict = NF_DROP;
1134
1135         /* Ensure the checksum is correct */
1136         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1137                 /* Failed checksum! */
1138                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
1139                           NIPQUAD(iph->saddr));
1140                 goto out;
1141         }
1142
1143         /* do the statistics and put it back */
1144         ip_vs_in_stats(cp, skb);
1145         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1146                 offset += 2 * sizeof(__u16);
1147         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1148         /* do not touch skb anymore */
1149
1150   out:
1151         __ip_vs_conn_put(cp);
1152
1153         return verdict;
1154 }
1155
#ifdef CONFIG_IP_VS_IPV6
/*
 *	Handle an incoming ICMPv6 message.
 *
 *	Only destination-unreachable, packet-too-big and time-exceeded are
 *	examined; for any other type *related is cleared and the packet is
 *	accepted.  The IPv6 header quoted in the ICMPv6 payload is used to
 *	find the connection: an incoming-direction hit is forwarded with
 *	ip_vs_icmp_xmit_v6(), an outgoing-direction hit (local client) goes
 *	through handle_response_icmp().
 *
 *	Note: unlike ip_vs_in_icmp(), no checksum verification is done here.
 *
 *	Returns a netfilter verdict.
 */
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
	struct ipv6hdr *outer;
	struct icmp6hdr icmp_buf, *icmp;
	struct ipv6hdr inner_buf, *inner;	/* header quoted inside the ICMPv6 */
	struct ip_vs_iphdr inner_iph;
	struct ip_vs_conn *conn;
	struct ip_vs_protocol *proto;
	unsigned int off = sizeof(struct ipv6hdr);
	unsigned int verdict;
	union nf_inet_addr src;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
				       IP_DEFRAG_VS_IN :
				       IP_DEFRAG_VS_FWD))
		return NF_STOLEN;

	/* Fetch the header pointer only now, after the possible reassembly. */
	outer = ipv6_hdr(skb);
	icmp = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
	if (!icmp)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
		  icmp->icmp6_type, ntohs(icmpv6_id(icmp)),
		  NIP6(outer->saddr), NIP6(outer->daddr));

	/*
	 * Cheap test first: anything that is not one of the error types
	 * we care about is reported as unrelated and left alone.
	 */
	switch (icmp->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	off += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, off, sizeof(inner_buf), &inner_buf);
	if (!inner)
		return NF_ACCEPT;	/* The packet looks wrong, ignore */

	proto = ip_vs_proto_get(inner->nexthdr);
	if (!proto)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && proto->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, proto, skb, off, "Checking incoming ICMPv6 for");

	/* Advance to the transport header that follows the quoted header. */
	off += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, inner, &inner_iph);
	/* The embedded headers contain source and dest in reverse order */
	conn = proto->conn_in_get(AF_INET6, skb, proto, &inner_iph, off, 1);
	if (!conn) {
		/* The packet could also belong to a local client */
		conn = proto->conn_out_get(AF_INET6, skb, proto, &inner_iph,
					   off, 1);
		if (!conn)
			return NF_ACCEPT;

		src.in6 = outer->saddr;
		return handle_response_icmp(AF_INET6, skb, &src,
					    inner->nexthdr,
					    conn, proto, off,
					    sizeof(struct ipv6hdr));
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(conn, skb);
	/* for TCP/UDP also step over the two 16-bit port fields */
	if (inner->nexthdr == IPPROTO_TCP || inner->nexthdr == IPPROTO_UDP)
		off += 2 * sizeof(__u16);
	verdict = ip_vs_icmp_xmit_v6(skb, conn, proto, off);
	/* do not touch skb anymore */

	__ip_vs_conn_put(conn);

	return verdict;
}
#endif
1253
1254
1255 /*
1256  *      Check if it's for virtual services, look it up,
1257  *      and send it on its way...
1258  */
1259 static unsigned int
1260 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1261          const struct net_device *in, const struct net_device *out,
1262          int (*okfn)(struct sk_buff *))
1263 {
1264         struct ip_vs_iphdr iph;
1265         struct ip_vs_protocol *pp;
1266         struct ip_vs_conn *cp;
1267         int ret, restart, af;
1268
1269         af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1270
1271         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1272
1273         /*
1274          *      Big tappo: only PACKET_HOST, including loopback for local client
1275          *      Don't handle local packets on IPv6 for now
1276          */
1277         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1278                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1279                               skb->pkt_type,
1280                               iph.protocol,
1281                               IP_VS_DBG_ADDR(af, &iph.daddr));
1282                 return NF_ACCEPT;
1283         }
1284
1285         if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1286                 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1287
1288                 if (related)
1289                         return verdict;
1290                 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1291         }
1292
1293         /* Protocol supported? */
1294         pp = ip_vs_proto_get(iph.protocol);
1295         if (unlikely(!pp))
1296                 return NF_ACCEPT;
1297
1298         /*
1299          * Check if the packet belongs to an existing connection entry
1300          */
1301         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1302
1303         if (unlikely(!cp)) {
1304                 int v;
1305
1306                 /* For local client packets, it could be a response */
1307                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1308                 if (cp)
1309                         return handle_response(af, skb, pp, cp, iph.len);
1310
1311                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1312                         return v;
1313         }
1314
1315         if (unlikely(!cp)) {
1316                 /* sorry, all this trouble for a no-hit :) */
1317                 IP_VS_DBG_PKT(12, pp, skb, 0,
1318                               "packet continues traversal as normal");
1319                 return NF_ACCEPT;
1320         }
1321
1322         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1323
1324         /* Check the server status */
1325         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1326                 /* the destination server is not available */
1327
1328                 if (sysctl_ip_vs_expire_nodest_conn) {
1329                         /* try to expire the connection immediately */
1330                         ip_vs_conn_expire_now(cp);
1331                 }
1332                 /* don't restart its timer, and silently
1333                    drop the packet. */
1334                 __ip_vs_conn_put(cp);
1335                 return NF_DROP;
1336         }
1337
1338         ip_vs_in_stats(cp, skb);
1339         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1340         if (cp->packet_xmit)
1341                 ret = cp->packet_xmit(skb, cp, pp);
1342                 /* do not touch skb anymore */
1343         else {
1344                 IP_VS_DBG_RL("warning: packet_xmit is null");
1345                 ret = NF_ACCEPT;
1346         }
1347
1348         /* Increase its packet counter and check if it is needed
1349          * to be synchronized
1350          *
1351          * Sync connection if it is about to close to
1352          * encorage the standby servers to update the connections timeout
1353          */
1354         atomic_inc(&cp->in_pkts);
1355         if (af == AF_INET &&
1356             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1357             (((cp->protocol != IPPROTO_TCP ||
1358                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1359               (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1360                == sysctl_ip_vs_sync_threshold[0])) ||
1361              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1362               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1363                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1364                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1365                 ip_vs_sync_conn(cp);
1366         cp->old_state = cp->state;
1367
1368         ip_vs_conn_put(cp);
1369         return ret;
1370 }
1371
1372
1373 /*
1374  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1375  *      related packets destined for 0.0.0.0/0.
1376  *      When fwmark-based virtual service is used, such as transparent
1377  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1378  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1379  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1380  *      and send them to ip_vs_in_icmp.
1381  */
1382 static unsigned int
1383 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1384                    const struct net_device *in, const struct net_device *out,
1385                    int (*okfn)(struct sk_buff *))
1386 {
1387         int r;
1388
1389         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1390                 return NF_ACCEPT;
1391
1392         return ip_vs_in_icmp(skb, &r, hooknum);
1393 }
1394
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 forward-chain hook: packets whose next header is ICMPv6 are
 *	handed to ip_vs_in_icmp_v6() and its verdict is returned (the
 *	"related" result is ignored); everything else is accepted untouched.
 */
static unsigned int
ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
		      const struct net_device *in, const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	int related;	/* filled in by ip_vs_in_icmp_v6(), not used here */

	if (ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6)
		return ip_vs_in_icmp_v6(skb, &related, hooknum);

	return NF_ACCEPT;
}
#endif
1409
1410
1411 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1412         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1413          * or VS/NAT(change destination), so that filtering rules can be
1414          * applied to IPVS. */
1415         {
1416                 .hook           = ip_vs_in,
1417                 .owner          = THIS_MODULE,
1418                 .pf             = PF_INET,
1419                 .hooknum        = NF_INET_LOCAL_IN,
1420                 .priority       = 100,
1421         },
1422         /* After packet filtering, change source only for VS/NAT */
1423         {
1424                 .hook           = ip_vs_out,
1425                 .owner          = THIS_MODULE,
1426                 .pf             = PF_INET,
1427                 .hooknum        = NF_INET_FORWARD,
1428                 .priority       = 100,
1429         },
1430         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1431          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1432         {
1433                 .hook           = ip_vs_forward_icmp,
1434                 .owner          = THIS_MODULE,
1435                 .pf             = PF_INET,
1436                 .hooknum        = NF_INET_FORWARD,
1437                 .priority       = 99,
1438         },
1439         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1440         {
1441                 .hook           = ip_vs_post_routing,
1442                 .owner          = THIS_MODULE,
1443                 .pf             = PF_INET,
1444                 .hooknum        = NF_INET_POST_ROUTING,
1445                 .priority       = NF_IP_PRI_NAT_SRC-1,
1446         },
1447 #ifdef CONFIG_IP_VS_IPV6
1448         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1449          * or VS/NAT(change destination), so that filtering rules can be
1450          * applied to IPVS. */
1451         {
1452                 .hook           = ip_vs_in,
1453                 .owner          = THIS_MODULE,
1454                 .pf             = PF_INET6,
1455                 .hooknum        = NF_INET_LOCAL_IN,
1456                 .priority       = 100,
1457         },
1458         /* After packet filtering, change source only for VS/NAT */
1459         {
1460                 .hook           = ip_vs_out,
1461                 .owner          = THIS_MODULE,
1462                 .pf             = PF_INET6,
1463                 .hooknum        = NF_INET_FORWARD,
1464                 .priority       = 100,
1465         },
1466         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1467          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1468         {
1469                 .hook           = ip_vs_forward_icmp_v6,
1470                 .owner          = THIS_MODULE,
1471                 .pf             = PF_INET6,
1472                 .hooknum        = NF_INET_FORWARD,
1473                 .priority       = 99,
1474         },
1475         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1476         {
1477                 .hook           = ip_vs_post_routing,
1478                 .owner          = THIS_MODULE,
1479                 .pf             = PF_INET6,
1480                 .hooknum        = NF_INET_POST_ROUTING,
1481                 .priority       = NF_IP6_PRI_NAT_SRC-1,
1482         },
1483 #endif
1484 };
1485
1486
1487 /*
1488  *      Initialize IP Virtual Server
1489  */
1490 static int __init ip_vs_init(void)
1491 {
1492         int ret;
1493
1494         ip_vs_estimator_init();
1495
1496         ret = ip_vs_control_init();
1497         if (ret < 0) {
1498                 IP_VS_ERR("can't setup control.\n");
1499                 goto cleanup_estimator;
1500         }
1501
1502         ip_vs_protocol_init();
1503
1504         ret = ip_vs_app_init();
1505         if (ret < 0) {
1506                 IP_VS_ERR("can't setup application helper.\n");
1507                 goto cleanup_protocol;
1508         }
1509
1510         ret = ip_vs_conn_init();
1511         if (ret < 0) {
1512                 IP_VS_ERR("can't setup connection table.\n");
1513                 goto cleanup_app;
1514         }
1515
1516         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1517         if (ret < 0) {
1518                 IP_VS_ERR("can't register hooks.\n");
1519                 goto cleanup_conn;
1520         }
1521
1522         IP_VS_INFO("ipvs loaded.\n");
1523         return ret;
1524
1525   cleanup_conn:
1526         ip_vs_conn_cleanup();
1527   cleanup_app:
1528         ip_vs_app_cleanup();
1529   cleanup_protocol:
1530         ip_vs_protocol_cleanup();
1531         ip_vs_control_cleanup();
1532   cleanup_estimator:
1533         ip_vs_estimator_cleanup();
1534         return ret;
1535 }
1536
1537 static void __exit ip_vs_cleanup(void)
1538 {
1539         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1540         ip_vs_conn_cleanup();
1541         ip_vs_app_cleanup();
1542         ip_vs_protocol_cleanup();
1543         ip_vs_control_cleanup();
1544         ip_vs_estimator_cleanup();
1545         IP_VS_INFO("ipvs unloaded.\n");
1546 }
1547
1548 module_init(ip_vs_init);
1549 module_exit(ip_vs_cleanup);
1550 MODULE_LICENSE("GPL");