net/ipv4/ipvs/ip_vs_core.c

   1 /*
   2  * IPVS         An implementation of the IP virtual server support for the
   3  *              LINUX operating system.  IPVS is now implemented as a module
   4  *              over the Netfilter framework. IPVS can be used to build a
   5  *              high-performance and highly available server based on a
   6  *              cluster of servers.
   7  *
   8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   9  *              Peter Kese <peter.kese@ijs.si>
  10  *              Julian Anastasov <ja@ssi.bg>
  11  *
  12  *              This program is free software; you can redistribute it and/or
  13  *              modify it under the terms of the GNU General Public License
  14  *              as published by the Free Software Foundation; either version
  15  *              2 of the License, or (at your option) any later version.
  16  *
  17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  19  * and others.
  20  *
  21  * Changes:
  22  *      Paul `Rusty' Russell            properly handle non-linear skbs
  23  *      Harald Welte                    don't use nfcache
  24  *
  25  */
  26
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
  48
  49
  50 EXPORT_SYMBOL(register_ip_vs_scheduler);
  51 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  52 EXPORT_SYMBOL(ip_vs_skb_replace);
  53 EXPORT_SYMBOL(ip_vs_proto_name);
  54 EXPORT_SYMBOL(ip_vs_conn_new);
  55 EXPORT_SYMBOL(ip_vs_conn_in_get);
  56 EXPORT_SYMBOL(ip_vs_conn_out_get);
  57 #ifdef CONFIG_IP_VS_PROTO_TCP
  58 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  59 #endif
  60 EXPORT_SYMBOL(ip_vs_conn_put);
  61 #ifdef CONFIG_IP_VS_DEBUG
  62 EXPORT_SYMBOL(ip_vs_get_debug_level);
  63 #endif
  64
  65
  66 /* ID used in ICMP lookups */
  67 #define icmp_id(icmph)          (((icmph)->un).echo.id)
  68 #define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
  69
  70 const char *ip_vs_proto_name(unsigned proto)
  71 {
  72         static char buf[20];
  73
  74         switch (proto) {
  75         case IPPROTO_IP:
  76                 return "IP";
  77         case IPPROTO_UDP:
  78                 return "UDP";
  79         case IPPROTO_TCP:
  80                 return "TCP";
  81         case IPPROTO_ICMP:
  82                 return "ICMP";
  83 #ifdef CONFIG_IP_VS_IPV6
  84         case IPPROTO_ICMPV6:
  85                 return "ICMPv6";
  86 #endif
  87         default:
  88                 sprintf(buf, "IP_%d", proto);
  89                 return buf;
  90         }
  91 }
  92
  93 void ip_vs_init_hash_table(struct list_head *table, int rows)
  94 {
  95         while (--rows >= 0)
  96                 INIT_LIST_HEAD(&table[rows]);
  97 }
  98
  99 static inline void
 100 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 101 {
 102         struct ip_vs_dest *dest = cp->dest;
 103         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 104                 spin_lock(&dest->stats.lock);
 105                 dest->stats.inpkts++;
 106                 dest->stats.inbytes += skb->len;
 107                 spin_unlock(&dest->stats.lock);
 108
 109                 spin_lock(&dest->svc->stats.lock);
 110                 dest->svc->stats.inpkts++;
 111                 dest->svc->stats.inbytes += skb->len;
 112                 spin_unlock(&dest->svc->stats.lock);
 113
 114                 spin_lock(&ip_vs_stats.lock);
 115                 ip_vs_stats.inpkts++;
 116                 ip_vs_stats.inbytes += skb->len;
 117                 spin_unlock(&ip_vs_stats.lock);
 118         }
 119 }
 120
 121
 122 static inline void
 123 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 124 {
 125         struct ip_vs_dest *dest = cp->dest;
 126         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 127                 spin_lock(&dest->stats.lock);
 128                 dest->stats.outpkts++;
 129                 dest->stats.outbytes += skb->len;
 130                 spin_unlock(&dest->stats.lock);
 131
 132                 spin_lock(&dest->svc->stats.lock);
 133                 dest->svc->stats.outpkts++;
 134                 dest->svc->stats.outbytes += skb->len;
 135                 spin_unlock(&dest->svc->stats.lock);
 136
 137                 spin_lock(&ip_vs_stats.lock);
 138                 ip_vs_stats.outpkts++;
 139                 ip_vs_stats.outbytes += skb->len;
 140                 spin_unlock(&ip_vs_stats.lock);
 141         }
 142 }
 143
 144
 145 static inline void
 146 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 147 {
 148         spin_lock(&cp->dest->stats.lock);
 149         cp->dest->stats.conns++;
 150         spin_unlock(&cp->dest->stats.lock);
 151
 152         spin_lock(&svc->stats.lock);
 153         svc->stats.conns++;
 154         spin_unlock(&svc->stats.lock);
 155
 156         spin_lock(&ip_vs_stats.lock);
 157         ip_vs_stats.conns++;
 158         spin_unlock(&ip_vs_stats.lock);
 159 }
 160
 161
 162 static inline int
 163 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 164                 const struct sk_buff *skb,
 165                 struct ip_vs_protocol *pp)
 166 {
 167         if (unlikely(!pp->state_transition))
 168                 return 0;
 169         return pp->state_transition(cp, direction, skb, pp);
 170 }
 171
 172
 173 /*
 174  *  IPVS persistent scheduling function
 175  *  It creates a connection entry according to its template if exists,
 176  *  or selects a server and creates a connection entry plus a template.
 177  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 178  *  Protocols supported: TCP, UDP
 179  */
 180 static struct ip_vs_conn *
 181 ip_vs_sched_persist(struct ip_vs_service *svc,
 182                     const struct sk_buff *skb,
 183                     __be16 ports[2])
 184 {
 185         struct ip_vs_conn *cp = NULL;
 186         struct ip_vs_iphdr iph;
 187         struct ip_vs_dest *dest;
 188         struct ip_vs_conn *ct;
 189         __be16  dport;                  /* destination port to forward */
 190         union nf_inet_addr snet;        /* source network of the client,
 191                                            after masking */
 192
 193         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 194
 195         /* Mask saddr with the netmask to adjust template granularity */
 196 #ifdef CONFIG_IP_VS_IPV6
 197         if (svc->af == AF_INET6)
 198                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
 199         else
 200 #endif
 201                 snet.ip = iph.saddr.ip & svc->netmask;
 202
 203         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 204                       "mnet %s\n",
 205                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
 206                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
 207                       IP_VS_DBG_ADDR(svc->af, &snet));
 208
 209         /*
 210          * As far as we know, FTP is a very complicated network protocol, and
 211          * it uses control connection and data connections. For active FTP,
 212          * FTP server initialize data connection to the client, its source port
 213          * is often 20. For passive FTP, FTP server tells the clients the port
 214          * that it passively listens to,  and the client issues the data
 215          * connection. In the tunneling or direct routing mode, the load
 216          * balancer is on the client-to-server half of connection, the port
 217          * number is unknown to the load balancer. So, a conn template like
 218          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 219          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 220          * is created for other persistent services.
 221          */
 222         if (ports[1] == svc->port) {
 223                 /* Check if a template already exists */
 224                 if (svc->port != FTPPORT)
 225                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 226                                              &iph.daddr, ports[1]);
 227                 else
 228                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 229                                              &iph.daddr, 0);
 230
 231                 if (!ct || !ip_vs_check_template(ct)) {
 232                         /*
 233                          * No template found or the dest of the connection
 234                          * template is not available.
 235                          */
 236                         dest = svc->scheduler->schedule(svc, skb);
 237                         if (dest == NULL) {
 238                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
 239                                 return NULL;
 240                         }
 241
 242                         /*
 243                          * Create a template like <protocol,caddr,0,
 244                          * vaddr,vport,daddr,dport> for non-ftp service,
 245                          * and <protocol,caddr,0,vaddr,0,daddr,0>
 246                          * for ftp service.
 247                          */
 248                         if (svc->port != FTPPORT)
 249                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 250                                                     &snet, 0,
 251                                                     &iph.daddr,
 252                                                     ports[1],
 253                                                     &dest->addr, dest->port,
 254                                                     IP_VS_CONN_F_TEMPLATE,
 255                                                     dest);
 256                         else
 257                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 258                                                     &snet, 0,
 259                                                     &iph.daddr, 0,
 260                                                     &dest->addr, 0,
 261                                                     IP_VS_CONN_F_TEMPLATE,
 262                                                     dest);
 263                         if (ct == NULL)
 264                                 return NULL;
 265
 266                         ct->timeout = svc->timeout;
 267                 } else {
 268                         /* set destination with the found template */
 269                         dest = ct->dest;
 270                 }
 271                 dport = dest->port;
 272         } else {
 273                 /*
 274                  * Note: persistent fwmark-based services and persistent
 275                  * port zero service are handled here.
 276                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 277                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
 278                  */
 279                 if (svc->fwmark) {
 280                         union nf_inet_addr fwmark = {
 281                                 .all = { 0, 0, 0, htonl(svc->fwmark) }
 282                         };
 283
 284                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
 285                                              &fwmark, 0);
 286                 } else
 287                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 288                                              &iph.daddr, 0);
 289
 290                 if (!ct || !ip_vs_check_template(ct)) {
 291                         /*
 292                          * If it is not persistent port zero, return NULL,
 293                          * otherwise create a connection template.
 294                          */
 295                         if (svc->port)
 296                                 return NULL;
 297
 298                         dest = svc->scheduler->schedule(svc, skb);
 299                         if (dest == NULL) {
 300                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
 301                                 return NULL;
 302                         }
 303
 304                         /*
 305                          * Create a template according to the service
 306                          */
 307                         if (svc->fwmark) {
 308                                 union nf_inet_addr fwmark = {
 309                                         .all = { 0, 0, 0, htonl(svc->fwmark) }
 310                                 };
 311
 312                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
 313                                                     &snet, 0,
 314                                                     &fwmark, 0,
 315                                                     &dest->addr, 0,
 316                                                     IP_VS_CONN_F_TEMPLATE,
 317                                                     dest);
 318                         } else
 319                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 320                                                     &snet, 0,
 321                                                     &iph.daddr, 0,
 322                                                     &dest->addr, 0,
 323                                                     IP_VS_CONN_F_TEMPLATE,
 324                                                     dest);
 325                         if (ct == NULL)
 326                                 return NULL;
 327
 328                         ct->timeout = svc->timeout;
 329                 } else {
 330                         /* set destination with the found template */
 331                         dest = ct->dest;
 332                 }
 333                 dport = ports[1];
 334         }
 335
 336         /*
 337          *    Create a new connection according to the template
 338          */
 339         cp = ip_vs_conn_new(svc->af, iph.protocol,
 340                             &iph.saddr, ports[0],
 341                             &iph.daddr, ports[1],
 342                             &dest->addr, dport,
 343                             0,
 344                             dest);
 345         if (cp == NULL) {
 346                 ip_vs_conn_put(ct);
 347                 return NULL;
 348         }
 349
 350         /*
 351          *    Add its control
 352          */
 353         ip_vs_control_add(cp, ct);
 354         ip_vs_conn_put(ct);
 355
 356         ip_vs_conn_stats(cp, svc);
 357         return cp;
 358 }
 359
 360
 361 /*
 362  *  IPVS main scheduling function
 363  *  It selects a server according to the virtual service, and
 364  *  creates a connection entry.
 365  *  Protocols supported: TCP, UDP
 366  */
 367 struct ip_vs_conn *
 368 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 369 {
 370         struct ip_vs_conn *cp = NULL;
 371         struct ip_vs_iphdr iph;
 372         struct ip_vs_dest *dest;
 373         __be16 _ports[2], *pptr;
 374
 375         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 376         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 377         if (pptr == NULL)
 378                 return NULL;
 379
 380         /*
 381          *    Persistent service
 382          */
 383         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 384                 return ip_vs_sched_persist(svc, skb, pptr);
 385
 386         /*
 387          *    Non-persistent service
 388          */
 389         if (!svc->fwmark && pptr[1] != svc->port) {
 390                 if (!svc->port)
 391                         IP_VS_ERR("Schedule: port zero only supported "
 392                                   "in persistent services, "
 393                                   "check your ipvs configuration\n");
 394                 return NULL;
 395         }
 396
 397         dest = svc->scheduler->schedule(svc, skb);
 398         if (dest == NULL) {
 399                 IP_VS_DBG(1, "Schedule: no dest found.\n");
 400                 return NULL;
 401         }
 402
 403         /*
 404          *    Create a connection entry.
 405          */
 406         cp = ip_vs_conn_new(svc->af, iph.protocol,
 407                             &iph.saddr, pptr[0],
 408                             &iph.daddr, pptr[1],
 409                             &dest->addr, dest->port ? dest->port : pptr[1],
 410                             0,
 411                             dest);
 412         if (cp == NULL)
 413                 return NULL;
 414
 415         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 416                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 417                       ip_vs_fwd_tag(cp),
 418                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
 419                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
 420                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
 421                       cp->flags, atomic_read(&cp->refcnt));
 422
 423         ip_vs_conn_stats(cp, svc);
 424         return cp;
 425 }
 426
 427
 428 /*
 429  *  Pass or drop the packet.
 430  *  Called by ip_vs_in, when the virtual service is available but
 431  *  no destination is available for a new connection.
 432  */
 433 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 434                 struct ip_vs_protocol *pp)
 435 {
 436         __be16 _ports[2], *pptr;
 437         struct ip_vs_iphdr iph;
 438         int unicast;
 439         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 440
 441         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 442         if (pptr == NULL) {
 443                 ip_vs_service_put(svc);
 444                 return NF_DROP;
 445         }
 446
 447 #ifdef CONFIG_IP_VS_IPV6
 448         if (svc->af == AF_INET6)
 449                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
 450         else
 451 #endif
 452                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
 453
 454         /* if it is fwmark-based service, the cache_bypass sysctl is up
 455            and the destination is a non-local unicast, then create
 456            a cache_bypass connection entry */
 457         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 458                 int ret, cs;
 459                 struct ip_vs_conn *cp;
 460
 461                 ip_vs_service_put(svc);
 462
 463                 /* create a new connection entry */
 464                 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
 465                 cp = ip_vs_conn_new(svc->af, iph.protocol,
 466                                     &iph.saddr, pptr[0],
 467                                     &iph.daddr, pptr[1],
 468                                     0, 0,
 469                                     IP_VS_CONN_F_BYPASS,
 470                                     NULL);
 471                 if (cp == NULL)
 472                         return NF_DROP;
 473
 474                 /* statistics */
 475                 ip_vs_in_stats(cp, skb);
 476
 477                 /* set state */
 478                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
 479
 480                 /* transmit the first SYN packet */
 481                 ret = cp->packet_xmit(skb, cp, pp);
 482                 /* do not touch skb anymore */
 483
 484                 atomic_inc(&cp->in_pkts);
 485                 ip_vs_conn_put(cp);
 486                 return ret;
 487         }
 488
 489         /*
 490          * When the virtual ftp service is presented, packets destined
 491          * for other services on the VIP may get here (except services
 492          * listed in the ipvs table), pass the packets, because it is
 493          * not ipvs job to decide to drop the packets.
 494          */
 495         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
 496                 ip_vs_service_put(svc);
 497                 return NF_ACCEPT;
 498         }
 499
 500         ip_vs_service_put(svc);
 501
 502         /*
 503          * Notify the client that the destination is unreachable, and
 504          * release the socket buffer.
 505          * Since it is in IP layer, the TCP socket is not actually
 506          * created, the TCP RST packet cannot be sent, instead that
 507          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 508          */
 509 #ifdef CONFIG_IP_VS_IPV6
 510         if (svc->af == AF_INET6)
 511                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
 512                             skb->dev);
 513         else
 514 #endif
 515                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 516
 517         return NF_DROP;
 518 }
 519
 520
 521 /*
 522  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
 523  *      chain, and is used for VS/NAT.
 524  *      It detects packets for VS/NAT connections and sends the packets
 525  *      immediately. This can avoid that iptable_nat mangles the packets
 526  *      for VS/NAT.
 527  */
 528 static unsigned int ip_vs_post_routing(unsigned int hooknum,
 529                                        struct sk_buff *skb,
 530                                        const struct net_device *in,
 531                                        const struct net_device *out,
 532                                        int (*okfn)(struct sk_buff *))
 533 {
 534         if (!skb->ipvs_property)
 535                 return NF_ACCEPT;
 536         /* The packet was sent from IPVS, exit this chain */
 537         return NF_STOP;
 538 }
 539
 540 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 541 {
 542         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 543 }
 544
 545 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 546 {
 547         int err = ip_defrag(skb, user);
 548
 549         if (!err)
 550                 ip_send_check(ip_hdr(skb));
 551
 552         return err;
 553 }
 554
 555 #ifdef CONFIG_IP_VS_IPV6
 556 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
 557 {
 558         /* TODO IPv6: Find out what to do here for IPv6 */
 559         return 0;
 560 }
 561 #endif
 562
 563 /*
 564  * Packet has been made sufficiently writable in caller
 565  * - inout: 1=in->out, 0=out->in
 566  */
 567 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 568                     struct ip_vs_conn *cp, int inout)
 569 {
 570         struct iphdr *iph        = ip_hdr(skb);
 571         unsigned int icmp_offset = iph->ihl*4;
 572         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
 573                                                       icmp_offset);
 574         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
 575
 576         if (inout) {
 577                 iph->saddr = cp->vaddr.ip;
 578                 ip_send_check(iph);
 579                 ciph->daddr = cp->vaddr.ip;
 580                 ip_send_check(ciph);
 581         } else {
 582                 iph->daddr = cp->daddr.ip;
 583                 ip_send_check(iph);
 584                 ciph->saddr = cp->daddr.ip;
 585                 ip_send_check(ciph);
 586         }
 587
 588         /* the TCP/UDP port */
 589         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
 590                 __be16 *ports = (void *)ciph + ciph->ihl*4;
 591
 592                 if (inout)
 593                         ports[1] = cp->vport;
 594                 else
 595                         ports[0] = cp->dport;
 596         }
 597
 598         /* And finally the ICMP checksum */
 599         icmph->checksum = 0;
 600         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 601         skb->ip_summed = CHECKSUM_UNNECESSARY;
 602
 603         if (inout)
 604                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 605                         "Forwarding altered outgoing ICMP");
 606         else
 607                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 608                         "Forwarding altered incoming ICMP");
 609 }
 610
 611 #ifdef CONFIG_IP_VS_IPV6
 612 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 613                     struct ip_vs_conn *cp, int inout)
 614 {
 615         struct ipv6hdr *iph      = ipv6_hdr(skb);
 616         unsigned int icmp_offset = sizeof(struct ipv6hdr);
 617         struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
 618                                                       icmp_offset);
 619         struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
 620
 621         if (inout) {
 622                 iph->saddr = cp->vaddr.in6;
 623                 ciph->daddr = cp->vaddr.in6;
 624         } else {
 625                 iph->daddr = cp->daddr.in6;
 626                 ciph->saddr = cp->daddr.in6;
 627         }
 628
 629         /* the TCP/UDP port */
 630         if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
 631                 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
 632
 633                 if (inout)
 634                         ports[1] = cp->vport;
 635                 else
 636                         ports[0] = cp->dport;
 637         }
 638
 639         /* And finally the ICMP checksum */
 640         icmph->icmp6_cksum = 0;
 641         /* TODO IPv6: is this correct for ICMPv6? */
 642         ip_vs_checksum_complete(skb, icmp_offset);
 643         skb->ip_summed = CHECKSUM_UNNECESSARY;
 644
 645         if (inout)
 646                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 647                         "Forwarding altered outgoing ICMPv6");
 648         else
 649                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 650                         "Forwarding altered incoming ICMPv6");
 651 }
 652 #endif
 653
 654 /* Handle relevant response ICMP messages - forward to the right
 655  * destination host. Used for NAT and local client.
 656  */
 657 static int handle_response_icmp(int af, struct sk_buff *skb,
 658                                 union nf_inet_addr *snet,
 659                                 __u8 protocol, struct ip_vs_conn *cp,
 660                                 struct ip_vs_protocol *pp,
 661                                 unsigned int offset, unsigned int ihl)
 662 {
 663         unsigned int verdict = NF_DROP;
 664
 665         if (IP_VS_FWD_METHOD(cp) != 0) {
 666                 IP_VS_ERR("shouldn't reach here, because the box is on the "
 667                           "half connection in the tun/dr module.\n");
 668         }
 669
 670         /* Ensure the checksum is correct */
 671         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 672                 /* Failed checksum! */
 673                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 674                               IP_VS_DBG_ADDR(af, snet));
 675                 goto out;
 676         }
 677
 678         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
 679                 offset += 2 * sizeof(__u16);
 680         if (!skb_make_writable(skb, offset))
 681                 goto out;
 682
 683 #ifdef CONFIG_IP_VS_IPV6
 684         if (af == AF_INET6)
 685                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 686         else
 687 #endif
 688                 ip_vs_nat_icmp(skb, pp, cp, 1);
 689
 690         /* do the statistics and put it back */
 691         ip_vs_out_stats(cp, skb);
 692
 693         skb->ipvs_property = 1;
 694         verdict = NF_ACCEPT;
 695
 696 out:
 697         __ip_vs_conn_put(cp);
 698
 699         return verdict;
 700 }
 701
 702 /*
 703  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
 704  *      Find any that might be relevant, check against existing connections.
 705  *      Currently handles error types - unreachable, quench, ttl exceeded.
 706  */
 707 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 708 {
 709         struct iphdr *iph;
 710         struct icmphdr  _icmph, *ic;
 711         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
 712         struct ip_vs_iphdr ciph;
 713         struct ip_vs_conn *cp;
 714         struct ip_vs_protocol *pp;
 715         unsigned int offset, ihl;
 716         union nf_inet_addr snet;
 717
 718         *related = 1;
 719
 720         /* reassemble IP fragments */
 721         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 722                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 723                         return NF_STOLEN;
 724         }
 725
 726         iph = ip_hdr(skb);
 727         offset = ihl = iph->ihl * 4;
 728         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 729         if (ic == NULL)
 730                 return NF_DROP;
 731
 732         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
 733                   ic->type, ntohs(icmp_id(ic)),
 734                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
 735
 736         /*
 737          * Work through seeing if this is for us.
 738          * These checks are supposed to be in an order that means easy
 739          * things are checked first to speed up processing.... however
 740          * this means that some packets will manage to get a long way
 741          * down this stack and then be rejected, but that's life.
 742          */
 743         if ((ic->type != ICMP_DEST_UNREACH) &&
 744             (ic->type != ICMP_SOURCE_QUENCH) &&
 745             (ic->type != ICMP_TIME_EXCEEDED)) {
 746                 *related = 0;
 747                 return NF_ACCEPT;
 748         }
 749
 750         /* Now find the contained IP header */
 751         offset += sizeof(_icmph);
 752         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 753         if (cih == NULL)
 754                 return NF_ACCEPT; /* The packet looks wrong, ignore */
 755
 756         pp = ip_vs_proto_get(cih->protocol);
 757         if (!pp)
 758                 return NF_ACCEPT;
 759
 760         /* Is the embedded protocol header present? */
 761         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 762                      pp->dont_defrag))
 763                 return NF_ACCEPT;
 764
 765         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
 766
 767         offset += cih->ihl * 4;
 768
 769         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 770         /* The embedded headers contain source and dest in reverse order */
 771         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
 772         if (!cp)
 773                 return NF_ACCEPT;
 774
 775         snet.ip = iph->saddr;
 776         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
 777                                     pp, offset, ihl);
 778 }
 779
 780 #ifdef CONFIG_IP_VS_IPV6
 781 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 782 {
 783         struct ipv6hdr *iph;
 784         struct icmp6hdr _icmph, *ic;
 785         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
 786                                            within the ICMP */
 787         struct ip_vs_iphdr ciph;
 788         struct ip_vs_conn *cp;
 789         struct ip_vs_protocol *pp;
 790         unsigned int offset;
 791         union nf_inet_addr snet;
 792
 793         *related = 1;
 794
 795         /* reassemble IP fragments */
 796         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
 797                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
 798                         return NF_STOLEN;
 799         }
 800
 801         iph = ipv6_hdr(skb);
 802         offset = sizeof(struct ipv6hdr);
 803         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 804         if (ic == NULL)
 805                 return NF_DROP;
 806
 807         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
 808                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
 809                   NIP6(iph->saddr), NIP6(iph->daddr));
 810
 811         /*
 812          * Work through seeing if this is for us.
 813          * These checks are supposed to be in an order that means easy
 814          * things are checked first to speed up processing.... however
 815          * this means that some packets will manage to get a long way
 816          * down this stack and then be rejected, but that's life.
 817          */
 818         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
 819             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
 820             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
 821                 *related = 0;
 822                 return NF_ACCEPT;
 823         }
 824
 825         /* Now find the contained IP header */
 826         offset += sizeof(_icmph);
 827         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 828         if (cih == NULL)
 829                 return NF_ACCEPT; /* The packet looks wrong, ignore */
 830
 831         pp = ip_vs_proto_get(cih->nexthdr);
 832         if (!pp)
 833                 return NF_ACCEPT;
 834
 835         /* Is the embedded protocol header present? */
 836         /* TODO: we don't support fragmentation at the moment anyways */
 837         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 838                 return NF_ACCEPT;
 839
 840         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
 841
 842         offset += sizeof(struct ipv6hdr);
 843
 844         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 845         /* The embedded headers contain source and dest in reverse order */
 846         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
 847         if (!cp)
 848                 return NF_ACCEPT;
 849
 850         snet.in6 = iph->saddr;
 851         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
 852                                     pp, offset, sizeof(struct ipv6hdr));
 853 }
 854 #endif
 855
 856 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
 857 {
 858         struct tcphdr _tcph, *th;
 859
 860         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
 861         if (th == NULL)
 862                 return 0;
 863         return th->rst;
 864 }
 865
 866 /* Handle response packets: rewrite addresses and send away...
 867  * Used for NAT and local client.
 868  */
 869 static unsigned int
 870 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 871                 struct ip_vs_conn *cp, int ihl)
 872 {
 873         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
 874
 875         if (!skb_make_writable(skb, ihl))
 876                 goto drop;
 877
 878         /* mangle the packet */
 879         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
 880                 goto drop;
 881
 882 #ifdef CONFIG_IP_VS_IPV6
 883         if (af == AF_INET6)
 884                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
 885         else
 886 #endif
 887         {
 888                 ip_hdr(skb)->saddr = cp->vaddr.ip;
 889                 ip_send_check(ip_hdr(skb));
 890         }
 891
 892         /* For policy routing, packets originating from this
 893          * machine itself may be routed differently to packets
 894          * passing through.  We want this packet to be routed as
 895          * if it came from this machine itself.  So re-compute
 896          * the routing information.
 897          */
 898 #ifdef CONFIG_IP_VS_IPV6
 899         if (af == AF_INET6) {
 900                 if (ip6_route_me_harder(skb) != 0)
 901                         goto drop;
 902         } else
 903 #endif
 904                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
 905                         goto drop;
 906
 907         /* For policy routing, packets originating from this
 908          * machine itself may be routed differently to packets
 909          * passing through.  We want this packet to be routed as
 910          * if it came from this machine itself.  So re-compute
 911          * the routing information.
 912          */
 913         if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
 914                 goto drop;
 915
 916         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 917
 918         ip_vs_out_stats(cp, skb);
 919         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
 920         ip_vs_conn_put(cp);
 921
 922         skb->ipvs_property = 1;
 923
 924         LeaveFunction(11);
 925         return NF_ACCEPT;
 926
 927 drop:
 928         ip_vs_conn_put(cp);
 929         kfree_skb(skb);
 930         return NF_STOLEN;
 931 }
 932
 933 /*
 934  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
 935  *      Check if outgoing packet belongs to the established ip_vs_conn.
 936  */
 937 static unsigned int
 938 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 939           const struct net_device *in, const struct net_device *out,
 940           int (*okfn)(struct sk_buff *))
 941 {
 942         struct ip_vs_iphdr iph;
 943         struct ip_vs_protocol *pp;
 944         struct ip_vs_conn *cp;
 945         int af;
 946
 947         EnterFunction(11);
 948
 949         af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 950
 951         if (skb->ipvs_property)
 952                 return NF_ACCEPT;
 953
 954         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 955 #ifdef CONFIG_IP_VS_IPV6
 956         if (af == AF_INET6) {
 957                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 958                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 959
 960                         if (related)
 961                                 return verdict;
 962                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 963                 }
 964         } else
 965 #endif
 966                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
 967                         int related, verdict = ip_vs_out_icmp(skb, &related);
 968
 969                         if (related)
 970                                 return verdict;
 971                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 972                 }
 973
 974         pp = ip_vs_proto_get(iph.protocol);
 975         if (unlikely(!pp))
 976                 return NF_ACCEPT;
 977
 978         /* reassemble IP fragments */
 979 #ifdef CONFIG_IP_VS_IPV6
 980         if (af == AF_INET6) {
 981                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 982                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 983
 984                         if (related)
 985                                 return verdict;
 986
 987                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 988                 }
 989         } else
 990 #endif
 991                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
 992                              !pp->dont_defrag)) {
 993                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 994                                 return NF_STOLEN;
 995
 996                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 997                 }
 998
 999         /*
1000          * Check if the packet belongs to an existing entry
1001          */
1002         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1003
1004         if (unlikely(!cp)) {
1005                 if (sysctl_ip_vs_nat_icmp_send &&
1006                     (pp->protocol == IPPROTO_TCP ||
1007                      pp->protocol == IPPROTO_UDP)) {
1008                         __be16 _ports[2], *pptr;
1009
1010                         pptr = skb_header_pointer(skb, iph.len,
1011                                                   sizeof(_ports), _ports);
1012                         if (pptr == NULL)
1013                                 return NF_ACCEPT;       /* Not for me */
1014                         if (ip_vs_lookup_real_service(af, iph.protocol,
1015                                                       &iph.saddr,
1016                                                       pptr[0])) {
1017                                 /*
1018                                  * Notify the real server: there is no
1019                                  * existing entry if it is not RST
1020                                  * packet or not TCP packet.
1021                                  */
1022                                 if (iph.protocol != IPPROTO_TCP
1023                                     || !is_tcp_reset(skb, iph.len)) {
1024 #ifdef CONFIG_IP_VS_IPV6
1025                                         if (af == AF_INET6)
1026                                                 icmpv6_send(skb,
1027                                                             ICMPV6_DEST_UNREACH,
1028                                                             ICMPV6_PORT_UNREACH,
1029                                                             0, skb->dev);
1030                                         else
1031 #endif
1032                                                 icmp_send(skb,
1033                                                           ICMP_DEST_UNREACH,
1034                                                           ICMP_PORT_UNREACH, 0);
1035                                         return NF_DROP;
1036                                 }
1037                         }
1038                 }
1039                 IP_VS_DBG_PKT(12, pp, skb, 0,
1040                               "packet continues traversal as normal");
1041                 return NF_ACCEPT;
1042         }
1043
1044         return handle_response(af, skb, pp, cp, iph.len);
1045 }
1046
1047
1048 /*
1049  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1050  *      Find any that might be relevant, check against existing connections,
1051  *      forward to the right destination host if relevant.
1052  *      Currently handles error types - unreachable, quench, ttl exceeded.
1053  */
1054 static int
1055 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1056 {
1057         struct iphdr *iph;
1058         struct icmphdr  _icmph, *ic;
1059         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1060         struct ip_vs_iphdr ciph;
1061         struct ip_vs_conn *cp;
1062         struct ip_vs_protocol *pp;
1063         unsigned int offset, ihl, verdict;
1064         union nf_inet_addr snet;
1065
1066         *related = 1;
1067
1068         /* reassemble IP fragments */
1069         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1070                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1071                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1072                         return NF_STOLEN;
1073         }
1074
1075         iph = ip_hdr(skb);
1076         offset = ihl = iph->ihl * 4;
1077         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1078         if (ic == NULL)
1079                 return NF_DROP;
1080
1081         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
1082                   ic->type, ntohs(icmp_id(ic)),
1083                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1084
1085         /*
1086          * Work through seeing if this is for us.
1087          * These checks are supposed to be in an order that means easy
1088          * things are checked first to speed up processing.... however
1089          * this means that some packets will manage to get a long way
1090          * down this stack and then be rejected, but that's life.
1091          */
1092         if ((ic->type != ICMP_DEST_UNREACH) &&
1093             (ic->type != ICMP_SOURCE_QUENCH) &&
1094             (ic->type != ICMP_TIME_EXCEEDED)) {
1095                 *related = 0;
1096                 return NF_ACCEPT;
1097         }
1098
1099         /* Now find the contained IP header */
1100         offset += sizeof(_icmph);
1101         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1102         if (cih == NULL)
1103                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1104
1105         pp = ip_vs_proto_get(cih->protocol);
1106         if (!pp)
1107                 return NF_ACCEPT;
1108
1109         /* Is the embedded protocol header present? */
1110         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1111                      pp->dont_defrag))
1112                 return NF_ACCEPT;
1113
1114         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1115
1116         offset += cih->ihl * 4;
1117
1118         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1119         /* The embedded headers contain source and dest in reverse order */
1120         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1121         if (!cp) {
1122                 /* The packet could also belong to a local client */
1123                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1124                 if (cp) {
1125                         snet.ip = iph->saddr;
1126                         return handle_response_icmp(AF_INET, skb, &snet,
1127                                                     cih->protocol, cp, pp,
1128                                                     offset, ihl);
1129                 }
1130                 return NF_ACCEPT;
1131         }
1132
1133         verdict = NF_DROP;
1134
1135         /* Ensure the checksum is correct */
1136         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1137                 /* Failed checksum! */
1138                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
1139                           NIPQUAD(iph->saddr));
1140                 goto out;
1141         }
1142
1143         /* do the statistics and put it back */
1144         ip_vs_in_stats(cp, skb);
1145         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1146                 offset += 2 * sizeof(__u16);
1147         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1148         /* do not touch skb anymore */
1149
1150   out:
1151         __ip_vs_conn_put(cp);
1152
1153         return verdict;
1154 }
1155
1156 #ifdef CONFIG_IP_VS_IPV6
1157 static int
1158 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1159 {
1160         struct ipv6hdr *iph;
1161         struct icmp6hdr _icmph, *ic;
1162         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1163                                            within the ICMP */
1164         struct ip_vs_iphdr ciph;
1165         struct ip_vs_conn *cp;
1166         struct ip_vs_protocol *pp;
1167         unsigned int offset, verdict;
1168         union nf_inet_addr snet;
1169
1170         *related = 1;
1171
1172         /* reassemble IP fragments */
1173         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1174                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1175                                                IP_DEFRAG_VS_IN :
1176                                                IP_DEFRAG_VS_FWD))
1177                         return NF_STOLEN;
1178         }
1179
1180         iph = ipv6_hdr(skb);
1181         offset = sizeof(struct ipv6hdr);
1182         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1183         if (ic == NULL)
1184                 return NF_DROP;
1185
1186         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
1187                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1188                   NIP6(iph->saddr), NIP6(iph->daddr));
1189
1190         /*
1191          * Work through seeing if this is for us.
1192          * These checks are supposed to be in an order that means easy
1193          * things are checked first to speed up processing.... however
1194          * this means that some packets will manage to get a long way
1195          * down this stack and then be rejected, but that's life.
1196          */
1197         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1198             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1199             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1200                 *related = 0;
1201                 return NF_ACCEPT;
1202         }
1203
1204         /* Now find the contained IP header */
1205         offset += sizeof(_icmph);
1206         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1207         if (cih == NULL)
1208                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1209
1210         pp = ip_vs_proto_get(cih->nexthdr);
1211         if (!pp)
1212                 return NF_ACCEPT;
1213
1214         /* Is the embedded protocol header present? */
1215         /* TODO: we don't support fragmentation at the moment anyways */
1216         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1217                 return NF_ACCEPT;
1218
1219         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1220
1221         offset += sizeof(struct ipv6hdr);
1222
1223         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1224         /* The embedded headers contain source and dest in reverse order */
1225         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1226         if (!cp) {
1227                 /* The packet could also belong to a local client */
1228                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1229                 if (cp) {
1230                         snet.in6 = iph->saddr;
1231                         return handle_response_icmp(AF_INET6, skb, &snet,
1232                                                     cih->nexthdr,
1233                                                     cp, pp, offset,
1234                                                     sizeof(struct ipv6hdr));
1235                 }
1236                 return NF_ACCEPT;
1237         }
1238
1239         verdict = NF_DROP;
1240
1241         /* do the statistics and put it back */
1242         ip_vs_in_stats(cp, skb);
1243         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1244                 offset += 2 * sizeof(__u16);
1245         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1246         /* do not touch skb anymore */
1247
1248         __ip_vs_conn_put(cp);
1249
1250         return verdict;
1251 }
1252 #endif
1253
1254
1255 /*
1256  *      Check if it's for virtual services, look it up,
1257  *      and send it on its way...
1258  */
1259 static unsigned int
1260 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1261          const struct net_device *in, const struct net_device *out,
1262          int (*okfn)(struct sk_buff *))
1263 {
1264         struct ip_vs_iphdr iph;
1265         struct ip_vs_protocol *pp;
1266         struct ip_vs_conn *cp;
1267         int ret, restart, af;
1268
1269         af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1270
1271         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1272
1273         /*
1274          *      Big tappo: only PACKET_HOST, including loopback for local client
1275          *      Don't handle local packets on IPv6 for now
1276          */
1277         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1278                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1279                               skb->pkt_type,
1280                               iph.protocol,
1281                               IP_VS_DBG_ADDR(af, &iph.daddr));
1282                 return NF_ACCEPT;
1283         }
1284
1285         if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1286                 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1287
1288                 if (related)
1289                         return verdict;
1290                 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1291         }
1292
1293         /* Protocol supported? */
1294         pp = ip_vs_proto_get(iph.protocol);
1295         if (unlikely(!pp))
1296                 return NF_ACCEPT;
1297
1298         /*
1299          * Check if the packet belongs to an existing connection entry
1300          */
1301         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1302
1303         if (unlikely(!cp)) {
1304                 int v;
1305
1306                 /* For local client packets, it could be a response */
1307                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1308                 if (cp)
1309                         return handle_response(af, skb, pp, cp, iph.len);
1310
1311                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1312                         return v;
1313         }
1314
1315         if (unlikely(!cp)) {
1316                 /* sorry, all this trouble for a no-hit :) */
1317                 IP_VS_DBG_PKT(12, pp, skb, 0,
1318                               "packet continues traversal as normal");
1319                 return NF_ACCEPT;
1320         }
1321
1322         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1323
1324         /* Check the server status */
1325         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1326                 /* the destination server is not available */
1327
1328                 if (sysctl_ip_vs_expire_nodest_conn) {
1329                         /* try to expire the connection immediately */
1330                         ip_vs_conn_expire_now(cp);
1331                 }
1332                 /* don't restart its timer, and silently
1333                    drop the packet. */
1334                 __ip_vs_conn_put(cp);
1335                 return NF_DROP;
1336         }
1337
1338         ip_vs_in_stats(cp, skb);
1339         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1340         if (cp->packet_xmit)
1341                 ret = cp->packet_xmit(skb, cp, pp);
1342                 /* do not touch skb anymore */
1343         else {
1344                 IP_VS_DBG_RL("warning: packet_xmit is null");
1345                 ret = NF_ACCEPT;
1346         }
1347
1348         /* Increase its packet counter and check if it is needed
1349          * to be synchronized
1350          *
1351          * Sync connection if it is about to close to
1352          * encorage the standby servers to update the connections timeout
1353          */
1354         atomic_inc(&cp->in_pkts);
1355         if (af == AF_INET &&
1356             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1357             (((cp->protocol != IPPROTO_TCP ||
1358                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1359               (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1360                == sysctl_ip_vs_sync_threshold[0])) ||
1361              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1362               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1363                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1364                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1365                 ip_vs_sync_conn(cp);
1366         cp->old_state = cp->state;
1367
1368         ip_vs_conn_put(cp);
1369         return ret;
1370 }
1371
1372
1373 /*
1374  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1375  *      related packets destined for 0.0.0.0/0.
1376  *      When fwmark-based virtual service is used, such as transparent
1377  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1378  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1379  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1380  *      and send them to ip_vs_in_icmp.
1381  */
1382 static unsigned int
1383 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1384                    const struct net_device *in, const struct net_device *out,
1385                    int (*okfn)(struct sk_buff *))
1386 {
1387         int r;
1388
1389         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1390                 return NF_ACCEPT;
1391
1392         return ip_vs_in_icmp(skb, &r, hooknum);
1393 }
1394
1395 #ifdef CONFIG_IP_VS_IPV6
1396 static unsigned int
1397 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1398                       const struct net_device *in, const struct net_device *out,
1399                       int (*okfn)(struct sk_buff *))
1400 {
1401         int r;
1402
1403         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1404                 return NF_ACCEPT;
1405
1406         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1407 }
1408 #endif
1409
1410
1411 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1412         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1413          * or VS/NAT(change destination), so that filtering rules can be
1414          * applied to IPVS. */
1415         {
1416                 .hook           = ip_vs_in,
1417                 .owner          = THIS_MODULE,
1418                 .pf             = PF_INET,
1419                 .hooknum        = NF_INET_LOCAL_IN,
1420                 .priority       = 100,
1421         },
1422         /* After packet filtering, change source only for VS/NAT */
1423         {
1424                 .hook           = ip_vs_out,
1425                 .owner          = THIS_MODULE,
1426                 .pf             = PF_INET,
1427                 .hooknum        = NF_INET_FORWARD,
1428                 .priority       = 100,
1429         },
1430         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1431          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1432         {
1433                 .hook           = ip_vs_forward_icmp,
1434                 .owner          = THIS_MODULE,
1435                 .pf             = PF_INET,
1436                 .hooknum        = NF_INET_FORWARD,
1437                 .priority       = 99,
1438         },
1439         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1440         {
1441                 .hook           = ip_vs_post_routing,
1442                 .owner          = THIS_MODULE,
1443                 .pf             = PF_INET,
1444                 .hooknum        = NF_INET_POST_ROUTING,
1445                 .priority       = NF_IP_PRI_NAT_SRC-1,
1446         },
1447 #ifdef CONFIG_IP_VS_IPV6
1448         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1449          * or VS/NAT(change destination), so that filtering rules can be
1450          * applied to IPVS. */
1451         {
1452                 .hook           = ip_vs_in,
1453                 .owner          = THIS_MODULE,
1454                 .pf             = PF_INET6,
1455                 .hooknum        = NF_INET_LOCAL_IN,
1456                 .priority       = 100,
1457         },
1458         /* After packet filtering, change source only for VS/NAT */
1459         {
1460                 .hook           = ip_vs_out,
1461                 .owner          = THIS_MODULE,
1462                 .pf             = PF_INET6,
1463                 .hooknum        = NF_INET_FORWARD,
1464                 .priority       = 100,
1465         },
1466         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1467          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1468         {
1469                 .hook           = ip_vs_forward_icmp_v6,
1470                 .owner          = THIS_MODULE,
1471                 .pf             = PF_INET6,
1472                 .hooknum        = NF_INET_FORWARD,
1473                 .priority       = 99,
1474         },
1475         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1476         {
1477                 .hook           = ip_vs_post_routing,
1478                 .owner          = THIS_MODULE,
1479                 .pf             = PF_INET6,
1480                 .hooknum        = NF_INET_POST_ROUTING,
1481                 .priority       = NF_IP6_PRI_NAT_SRC-1,
1482         },
1483 #endif
1484 };
1485
1486
1487 /*
1488  *      Initialize IP Virtual Server
1489  */
1490 static int __init ip_vs_init(void)
1491 {
1492         int ret;
1493
1494         ip_vs_estimator_init();
1495
1496         ret = ip_vs_control_init();
1497         if (ret < 0) {
1498                 IP_VS_ERR("can't setup control.\n");
1499                 goto cleanup_estimator;
1500         }
1501
1502         ip_vs_protocol_init();
1503
1504         ret = ip_vs_app_init();
1505         if (ret < 0) {
1506                 IP_VS_ERR("can't setup application helper.\n");
1507                 goto cleanup_protocol;
1508         }
1509
1510         ret = ip_vs_conn_init();
1511         if (ret < 0) {
1512                 IP_VS_ERR("can't setup connection table.\n");
1513                 goto cleanup_app;
1514         }
1515
1516         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1517         if (ret < 0) {
1518                 IP_VS_ERR("can't register hooks.\n");
1519                 goto cleanup_conn;
1520         }
1521
1522         IP_VS_INFO("ipvs loaded.\n");
1523         return ret;
1524
1525   cleanup_conn:
1526         ip_vs_conn_cleanup();
1527   cleanup_app:
1528         ip_vs_app_cleanup();
1529   cleanup_protocol:
1530         ip_vs_protocol_cleanup();
1531         ip_vs_control_cleanup();
1532   cleanup_estimator:
1533         ip_vs_estimator_cleanup();
1534         return ret;
1535 }
1536
1537 static void __exit ip_vs_cleanup(void)
1538 {
1539         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1540         ip_vs_conn_cleanup();
1541         ip_vs_app_cleanup();
1542         ip_vs_protocol_cleanup();
1543         ip_vs_control_cleanup();
1544         ip_vs_estimator_cleanup();
1545         IP_VS_INFO("ipvs unloaded.\n");
1546 }
1547
1548 module_init(ip_vs_init);
1549 module_exit(ip_vs_cleanup);
1550 MODULE_LICENSE("GPL");