1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' how they were originally intended
26  * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27  *      - add support for L3 protocol module load on demand.
28  *
29  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30  */
31
32 #include <linux/types.h>
33 #include <linux/netfilter.h>
34 #include <linux/module.h>
35 #include <linux/skbuff.h>
36 #include <linux/proc_fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/stddef.h>
39 #include <linux/slab.h>
40 #include <linux/random.h>
41 #include <linux/jhash.h>
42 #include <linux/err.h>
43 #include <linux/percpu.h>
44 #include <linux/moduleparam.h>
45 #include <linux/notifier.h>
46 #include <linux/kernel.h>
47 #include <linux/netdevice.h>
48 #include <linux/socket.h>
49
50 /* This rwlock protects the main hash table, protocol/helper/expectation
51    registrations and conntrack timers */
52 #define ASSERT_READ_LOCK(x)
53 #define ASSERT_WRITE_LOCK(x)
54
55 #include <net/netfilter/nf_conntrack.h>
56 #include <net/netfilter/nf_conntrack_l3proto.h>
57 #include <net/netfilter/nf_conntrack_protocol.h>
58 #include <net/netfilter/nf_conntrack_helper.h>
59 #include <net/netfilter/nf_conntrack_core.h>
60
61 #define NF_CONNTRACK_VERSION    "0.5.0"
62
63 #if 0
64 #define DEBUGP printk
65 #else
66 #define DEBUGP(format, args...)
67 #endif
68
69 DEFINE_RWLOCK(nf_conntrack_lock);
70
71 /* nf_conntrack_standalone needs this */
72 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
73
74 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
75 LIST_HEAD(nf_conntrack_expect_list);
76 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
77 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
78 static LIST_HEAD(helpers);
79 unsigned int nf_conntrack_htable_size __read_mostly = 0;
80 int nf_conntrack_max __read_mostly;
81 struct list_head *nf_conntrack_hash;
82 static kmem_cache_t *nf_conntrack_expect_cachep;
83 struct nf_conn nf_conntrack_untracked;
84 unsigned int nf_ct_log_invalid __read_mostly;
85 static LIST_HEAD(unconfirmed);
86 static int nf_conntrack_vmalloc;
87
88 static unsigned int nf_conntrack_next_id;
89 static unsigned int nf_conntrack_expect_next_id;
90 #ifdef CONFIG_NF_CONNTRACK_EVENTS
91 ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
92 ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
93
94 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
95
96 /* deliver cached events and clear cache entry - must be called with locally
97  * disabled softirqs */
98 static inline void
99 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
100 {
101         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
102         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
103             && ecache->events)
104                 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
105                                     ecache->ct);
106
107         ecache->events = 0;
108         nf_ct_put(ecache->ct);
109         ecache->ct = NULL;
110 }
111
112 /* Deliver all cached events for a particular conntrack. This is called
113  * by code prior to async packet handling for freeing the skb */
114 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
115 {
116         struct nf_conntrack_ecache *ecache;
117
118         local_bh_disable();
119         ecache = &__get_cpu_var(nf_conntrack_ecache);
120         if (ecache->ct == ct)
121                 __nf_ct_deliver_cached_events(ecache);
122         local_bh_enable();
123 }
124
125 /* Deliver any old pending cached events if the current conntrack != the cached one */
126 void __nf_ct_event_cache_init(struct nf_conn *ct)
127 {
128         struct nf_conntrack_ecache *ecache;
129         
130         /* take care of delivering potentially old events */
131         ecache = &__get_cpu_var(nf_conntrack_ecache);
132         BUG_ON(ecache->ct == ct);
133         if (ecache->ct)
134                 __nf_ct_deliver_cached_events(ecache);
135         /* initialize for this conntrack/packet */
136         ecache->ct = ct;
137         nf_conntrack_get(&ct->ct_general);
138 }
139
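/*
 * Usage sketch: callers record events against the conntrack attached to an
 * skb with nf_conntrack_event_cache(); the bits stay in the per-CPU cache
 * until nf_ct_deliver_cached_events() (or the arrival of a packet for a
 * different conntrack) flushes them to the notifier chain.  The example_
 * name below is purely illustrative.
 */
#if 0
static void example_mark_assured(struct sk_buff *skb, struct nf_conn *ct)
{
	/* assumes ct is the conntrack already attached to skb->nfct */
	set_bit(IPS_ASSURED_BIT, &ct->status);
	nf_conntrack_event_cache(IPCT_STATUS, skb);	/* queued, not yet delivered */
}
#endif
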
140 /* flush the event cache - touches other CPUs' data and must not be called
141  * while packets are still passing through the code */
142 static void nf_ct_event_cache_flush(void)
143 {
144         struct nf_conntrack_ecache *ecache;
145         int cpu;
146
147         for_each_possible_cpu(cpu) {
148                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
149                 if (ecache->ct)
150                         nf_ct_put(ecache->ct);
151         }
152 }
153 #else
154 static inline void nf_ct_event_cache_flush(void) {}
155 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
156
157 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
158 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
159
160 /*
161  * This scheme offers various sizes of "struct nf_conn" depending on
162  * the features (helper, nat, ...)
163  */
164
165 #define NF_CT_FEATURES_NAMELEN  256
166 static struct {
167         /* name of slab cache. printed in /proc/slabinfo */
168         char *name;
169
170         /* size of slab cache */
171         size_t size;
172
173         /* slab cache pointer */
174         kmem_cache_t *cachep;
175
176         /* allocated slab cache + modules which use this slab cache */
177         int use;
178
179 } nf_ct_cache[NF_CT_F_NUM];
180
181 /* protect members of nf_ct_cache except "use" */
182 DEFINE_RWLOCK(nf_ct_cache_lock);
183
184 /* This avoids calling kmem_cache_create() with the same name simultaneously */
185 static DEFINE_MUTEX(nf_ct_cache_mutex);
186
187 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
188 struct nf_conntrack_protocol *
189 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
190 {
191         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
192                 return &nf_conntrack_generic_protocol;
193
194         return nf_ct_protos[l3proto][protocol];
195 }
196
197 /* this is guaranteed to always return a valid protocol helper, since
198  * it falls back to generic_protocol */
199 struct nf_conntrack_protocol *
200 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
201 {
202         struct nf_conntrack_protocol *p;
203
204         preempt_disable();
205         p = __nf_ct_proto_find(l3proto, protocol);
206         if (!try_module_get(p->me))
207                 p = &nf_conntrack_generic_protocol;
208         preempt_enable();
209         
210         return p;
211 }
212
213 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
214 {
215         module_put(p->me);
216 }
217
218 struct nf_conntrack_l3proto *
219 nf_ct_l3proto_find_get(u_int16_t l3proto)
220 {
221         struct nf_conntrack_l3proto *p;
222
223         preempt_disable();
224         p = __nf_ct_l3proto_find(l3proto);
225         if (!try_module_get(p->me))
226                 p = &nf_conntrack_generic_l3proto;
227         preempt_enable();
228
229         return p;
230 }
231
232 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
233 {
234         module_put(p->me);
235 }
236
237 int
238 nf_ct_l3proto_try_module_get(unsigned short l3proto)
239 {
240         int ret;
241         struct nf_conntrack_l3proto *p;
242
243 retry:  p = nf_ct_l3proto_find_get(l3proto);
244         if (p == &nf_conntrack_generic_l3proto) {
245                 ret = request_module("nf_conntrack-%d", l3proto);
246                 if (!ret)
247                         goto retry;
248
249                 return -EPROTOTYPE;
250         }
251
252         return 0;
253 }
254
255 void nf_ct_l3proto_module_put(unsigned short l3proto)
256 {
257         struct nf_conntrack_l3proto *p;
258
259         preempt_disable();
260         p = __nf_ct_l3proto_find(l3proto);
261         preempt_enable();
262
263         module_put(p->me);
264 }
265
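/*
 * Usage sketch: a module that depends on an L3 tracker pins it for the
 * duration of its use and releases it with nf_ct_l3proto_module_put().
 * The request_module() alias follows the "nf_conntrack-<PF>" convention,
 * so e.g. PF_INET (2) loads via "nf_conntrack-2".  The example_ name is
 * purely illustrative.
 */
#if 0
static int example_pin_ipv4_tracker(void)
{
	int err;

	err = nf_ct_l3proto_try_module_get(PF_INET);
	if (err < 0)
		return err;
	/* ... code that relies on IPv4 connection tracking ... */
	nf_ct_l3proto_module_put(PF_INET);
	return 0;
}
#endif
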
266 static int nf_conntrack_hash_rnd_initted;
267 static unsigned int nf_conntrack_hash_rnd;
268
269 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
270                                   unsigned int size, unsigned int rnd)
271 {
272         unsigned int a, b;
273         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
274                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
275         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
276                         (tuple->src.u.all << 16) | tuple->dst.u.all);
277
278         return jhash_2words(a, b, rnd) % size;
279 }
280
281 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
282 {
283         return __hash_conntrack(tuple, nf_conntrack_htable_size,
284                                 nf_conntrack_hash_rnd);
285 }
286
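/*
 * Usage note: a feature that needs extra per-conntrack storage registers a
 * slab cache sized for the basic struct nf_conn plus its private data,
 * e.g. nf_conntrack_helper_register() below registers NF_CT_F_HELP with
 * room for a struct nf_conn_help.  Re-registering the same (features,
 * name, size) triple just bumps the use count; conflicting parameters
 * return -EBUSY.
 */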
287 int nf_conntrack_register_cache(u_int32_t features, const char *name,
288                                 size_t size)
289 {
290         int ret = 0;
291         char *cache_name;
292         kmem_cache_t *cachep;
293
294         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
295                features, name, size);
296
297         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
298                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
299                         features);
300                 return -EINVAL;
301         }
302
303         mutex_lock(&nf_ct_cache_mutex);
304
305         write_lock_bh(&nf_ct_cache_lock);
306         /* e.g.: multiple helpers are loaded */
307         if (nf_ct_cache[features].use > 0) {
308                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
309                 if ((!strncmp(nf_ct_cache[features].name, name,
310                               NF_CT_FEATURES_NAMELEN))
311                     && nf_ct_cache[features].size == size) {
312                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
313                         nf_ct_cache[features].use++;
314                         ret = 0;
315                 } else
316                         ret = -EBUSY;
317
318                 write_unlock_bh(&nf_ct_cache_lock);
319                 mutex_unlock(&nf_ct_cache_mutex);
320                 return ret;
321         }
322         write_unlock_bh(&nf_ct_cache_lock);
323
324         /*
325          * The memory holding the slab cache name must stay alive until
326          * the cache is destroyed.
327          */
328         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
329         if (cache_name == NULL) {
330                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
331                 ret = -ENOMEM;
332                 goto out_up_mutex;
333         }
334
335         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
336                                                 >= NF_CT_FEATURES_NAMELEN) {
337                 printk("nf_conntrack_register_cache: name too long\n");
338                 ret = -EINVAL;
339                 goto out_free_name;
340         }
341
342         cachep = kmem_cache_create(cache_name, size, 0, 0,
343                                    NULL, NULL);
344         if (!cachep) {
345                 printk("nf_conntrack_register_cache: Can't create slab cache "
346                        "for the features = 0x%x\n", features);
347                 ret = -ENOMEM;
348                 goto out_free_name;
349         }
350
351         write_lock_bh(&nf_ct_cache_lock);
352         nf_ct_cache[features].use = 1;
353         nf_ct_cache[features].size = size;
354         nf_ct_cache[features].cachep = cachep;
355         nf_ct_cache[features].name = cache_name;
356         write_unlock_bh(&nf_ct_cache_lock);
357
358         goto out_up_mutex;
359
360 out_free_name:
361         kfree(cache_name);
362 out_up_mutex:
363         mutex_unlock(&nf_ct_cache_mutex);
364         return ret;
365 }
366
367 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
368 void nf_conntrack_unregister_cache(u_int32_t features)
369 {
370         kmem_cache_t *cachep;
371         char *name;
372
373         /*
374          * This ensures that kmem_cache_create() isn't called while the
375          * slab cache is being destroyed.
376          */
377         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
378         mutex_lock(&nf_ct_cache_mutex);
379
380         write_lock_bh(&nf_ct_cache_lock);
381         if (--nf_ct_cache[features].use > 0) {
382                 write_unlock_bh(&nf_ct_cache_lock);
383                 mutex_unlock(&nf_ct_cache_mutex);
384                 return;
385         }
386         cachep = nf_ct_cache[features].cachep;
387         name = nf_ct_cache[features].name;
388         nf_ct_cache[features].cachep = NULL;
389         nf_ct_cache[features].name = NULL;
390         nf_ct_cache[features].size = 0;
391         write_unlock_bh(&nf_ct_cache_lock);
392
393         synchronize_net();
394
395         kmem_cache_destroy(cachep);
396         kfree(name);
397
398         mutex_unlock(&nf_ct_cache_mutex);
399 }
400
401 int
402 nf_ct_get_tuple(const struct sk_buff *skb,
403                 unsigned int nhoff,
404                 unsigned int dataoff,
405                 u_int16_t l3num,
406                 u_int8_t protonum,
407                 struct nf_conntrack_tuple *tuple,
408                 const struct nf_conntrack_l3proto *l3proto,
409                 const struct nf_conntrack_protocol *protocol)
410 {
411         NF_CT_TUPLE_U_BLANK(tuple);
412
413         tuple->src.l3num = l3num;
414         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
415                 return 0;
416
417         tuple->dst.protonum = protonum;
418         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
419
420         return protocol->pkt_to_tuple(skb, dataoff, tuple);
421 }
422
423 int
424 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
425                    const struct nf_conntrack_tuple *orig,
426                    const struct nf_conntrack_l3proto *l3proto,
427                    const struct nf_conntrack_protocol *protocol)
428 {
429         NF_CT_TUPLE_U_BLANK(inverse);
430
431         inverse->src.l3num = orig->src.l3num;
432         if (l3proto->invert_tuple(inverse, orig) == 0)
433                 return 0;
434
435         inverse->dst.dir = !orig->dst.dir;
436
437         inverse->dst.protonum = orig->dst.protonum;
438         return protocol->invert_tuple(inverse, orig);
439 }
440
441 /* nf_conntrack_expect helper functions */
442 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
443 {
444         struct nf_conn_help *master_help = nfct_help(exp->master);
445
446         NF_CT_ASSERT(master_help);
447         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
448         NF_CT_ASSERT(!timer_pending(&exp->timeout));
449
450         list_del(&exp->list);
451         NF_CT_STAT_INC(expect_delete);
452         master_help->expecting--;
453         nf_conntrack_expect_put(exp);
454 }
455
456 static void expectation_timed_out(unsigned long ul_expect)
457 {
458         struct nf_conntrack_expect *exp = (void *)ul_expect;
459
460         write_lock_bh(&nf_conntrack_lock);
461         nf_ct_unlink_expect(exp);
462         write_unlock_bh(&nf_conntrack_lock);
463         nf_conntrack_expect_put(exp);
464 }
465
466 struct nf_conntrack_expect *
467 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
468 {
469         struct nf_conntrack_expect *i;
470         
471         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
472                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
473                         atomic_inc(&i->use);
474                         return i;
475                 }
476         }
477         return NULL;
478 }
479
480 /* Just find an expectation corresponding to a tuple. */
481 struct nf_conntrack_expect *
482 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
483 {
484         struct nf_conntrack_expect *i;
485         
486         read_lock_bh(&nf_conntrack_lock);
487         i = __nf_conntrack_expect_find(tuple);
488         read_unlock_bh(&nf_conntrack_lock);
489
490         return i;
491 }
492
493 /* If an expectation for this connection is found, it is deleted from
494  * the global list and then returned. */
495 static struct nf_conntrack_expect *
496 find_expectation(const struct nf_conntrack_tuple *tuple)
497 {
498         struct nf_conntrack_expect *i;
499
500         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
501         /* If master is not in hash table yet (ie. packet hasn't left
502            this machine yet), how can the other end know about the expectation?
503            Hence these are not the droids you are looking for (if
504            master ct never got confirmed, we'd hold a reference to it
505            and weird things would happen to future packets). */
506                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
507                     && nf_ct_is_confirmed(i->master)) {
508                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
509                                 atomic_inc(&i->use);
510                                 return i;
511                         } else if (del_timer(&i->timeout)) {
512                                 nf_ct_unlink_expect(i);
513                                 return i;
514                         }
515                 }
516         }
517         return NULL;
518 }
519
520 /* delete all expectations for this conntrack */
521 void nf_ct_remove_expectations(struct nf_conn *ct)
522 {
523         struct nf_conntrack_expect *i, *tmp;
524         struct nf_conn_help *help = nfct_help(ct);
525
526         /* Optimization: most connections never expect any others. */
527         if (!help || help->expecting == 0)
528                 return;
529
530         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
531                 if (i->master == ct && del_timer(&i->timeout)) {
532                         nf_ct_unlink_expect(i);
533                         nf_conntrack_expect_put(i);
534                 }
535         }
536 }
537
538 static void
539 clean_from_lists(struct nf_conn *ct)
540 {
541         DEBUGP("clean_from_lists(%p)\n", ct);
542         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
543         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
544         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
545
546         /* Destroy all pending expectations */
547         nf_ct_remove_expectations(ct);
548 }
549
550 static void
551 destroy_conntrack(struct nf_conntrack *nfct)
552 {
553         struct nf_conn *ct = (struct nf_conn *)nfct;
554         struct nf_conntrack_l3proto *l3proto;
555         struct nf_conntrack_protocol *proto;
556
557         DEBUGP("destroy_conntrack(%p)\n", ct);
558         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
559         NF_CT_ASSERT(!timer_pending(&ct->timeout));
560
561         nf_conntrack_event(IPCT_DESTROY, ct);
562         set_bit(IPS_DYING_BIT, &ct->status);
563
564         /* To make sure we don't get any weird locking issues here:
565          * destroy_conntrack() MUST NOT be called with a write lock
566          * to nf_conntrack_lock!!! -HW */
567         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
568         if (l3proto && l3proto->destroy)
569                 l3proto->destroy(ct);
570
571         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
572         if (proto && proto->destroy)
573                 proto->destroy(ct);
574
575         if (nf_conntrack_destroyed)
576                 nf_conntrack_destroyed(ct);
577
578         write_lock_bh(&nf_conntrack_lock);
579         /* Expectations will have been removed in clean_from_lists,
580          * except TFTP can create an expectation on the first packet,
581          * before connection is in the list, so we need to clean here,
582          * too. */
583         nf_ct_remove_expectations(ct);
584
585         /* We overload first tuple to link into unconfirmed list. */
586         if (!nf_ct_is_confirmed(ct)) {
587                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
588                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
589         }
590
591         NF_CT_STAT_INC(delete);
592         write_unlock_bh(&nf_conntrack_lock);
593
594         if (ct->master)
595                 nf_ct_put(ct->master);
596
597         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
598         nf_conntrack_free(ct);
599 }
600
601 static void death_by_timeout(unsigned long ul_conntrack)
602 {
603         struct nf_conn *ct = (void *)ul_conntrack;
604
605         write_lock_bh(&nf_conntrack_lock);
606         /* Inside lock so preempt is disabled on module removal path.
607          * Otherwise we can get spurious warnings. */
608         NF_CT_STAT_INC(delete_list);
609         clean_from_lists(ct);
610         write_unlock_bh(&nf_conntrack_lock);
611         nf_ct_put(ct);
612 }
613
614 struct nf_conntrack_tuple_hash *
615 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
616                     const struct nf_conn *ignored_conntrack)
617 {
618         struct nf_conntrack_tuple_hash *h;
619         unsigned int hash = hash_conntrack(tuple);
620
621         ASSERT_READ_LOCK(&nf_conntrack_lock);
622         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
623                 if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
624                     nf_ct_tuple_equal(tuple, &h->tuple)) {
625                         NF_CT_STAT_INC(found);
626                         return h;
627                 }
628                 NF_CT_STAT_INC(searched);
629         }
630
631         return NULL;
632 }
633
634 /* Find a connection corresponding to a tuple. */
635 struct nf_conntrack_tuple_hash *
636 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
637                       const struct nf_conn *ignored_conntrack)
638 {
639         struct nf_conntrack_tuple_hash *h;
640
641         read_lock_bh(&nf_conntrack_lock);
642         h = __nf_conntrack_find(tuple, ignored_conntrack);
643         if (h)
644                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
645         read_unlock_bh(&nf_conntrack_lock);
646
647         return h;
648 }
649
650 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
651                                        unsigned int hash,
652                                        unsigned int repl_hash) 
653 {
654         ct->id = ++nf_conntrack_next_id;
655         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
656                  &nf_conntrack_hash[hash]);
657         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
658                  &nf_conntrack_hash[repl_hash]);
659 }
660
661 void nf_conntrack_hash_insert(struct nf_conn *ct)
662 {
663         unsigned int hash, repl_hash;
664
665         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
666         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
667
668         write_lock_bh(&nf_conntrack_lock);
669         __nf_conntrack_hash_insert(ct, hash, repl_hash);
670         write_unlock_bh(&nf_conntrack_lock);
671 }
672
673 /* Confirm a connection given skb; places it in hash table */
674 int
675 __nf_conntrack_confirm(struct sk_buff **pskb)
676 {
677         unsigned int hash, repl_hash;
678         struct nf_conntrack_tuple_hash *h;
679         struct nf_conn *ct;
680         struct nf_conn_help *help;
681         enum ip_conntrack_info ctinfo;
682
683         ct = nf_ct_get(*pskb, &ctinfo);
684
685         /* ipt_REJECT uses nf_conntrack_attach to attach related
686            ICMP/TCP RST packets in the other direction.  The actual packet
687            which created the connection will be IP_CT_NEW or, for an
688            expected connection, IP_CT_RELATED. */
689         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
690                 return NF_ACCEPT;
691
692         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
693         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
694
695         /* We're not in hash table, and we refuse to set up related
696            connections for unconfirmed conns.  But packet copies and
697            REJECT will give spurious warnings here. */
698         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
699
700         /* No external references means no one else could have
701            confirmed us. */
702         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
703         DEBUGP("Confirming conntrack %p\n", ct);
704
705         write_lock_bh(&nf_conntrack_lock);
706
707         /* See if there's one in the list already, including reverse:
708            NAT could have grabbed it without realizing, since we're
709            not in the hash.  If there is, we lost the race. */
710         list_for_each_entry(h, &nf_conntrack_hash[hash], list)
711                 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
712                                       &h->tuple))
713                         goto out;
714         list_for_each_entry(h, &nf_conntrack_hash[repl_hash], list)
715                 if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
716                                       &h->tuple))
717                         goto out;
718
719         /* Remove from unconfirmed list */
720         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
721
722         __nf_conntrack_hash_insert(ct, hash, repl_hash);
723         /* Timer relative to confirmation time, not original
724            setting time, otherwise we'd get timer wrap in
725            weird delay cases. */
726         ct->timeout.expires += jiffies;
727         add_timer(&ct->timeout);
728         atomic_inc(&ct->ct_general.use);
729         set_bit(IPS_CONFIRMED_BIT, &ct->status);
730         NF_CT_STAT_INC(insert);
731         write_unlock_bh(&nf_conntrack_lock);
732         help = nfct_help(ct);
733         if (help && help->helper)
734                 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
735 #ifdef CONFIG_NF_NAT_NEEDED
736         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
737             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
738                 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
739 #endif
740         nf_conntrack_event_cache(master_ct(ct) ?
741                                  IPCT_RELATED : IPCT_NEW, *pskb);
742         return NF_ACCEPT;
743
744 out:
745         NF_CT_STAT_INC(insert_failed);
746         write_unlock_bh(&nf_conntrack_lock);
747         return NF_DROP;
748 }
749
750 /* Returns true if a connection corresponds to the tuple (required
751    for NAT). */
752 int
753 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
754                          const struct nf_conn *ignored_conntrack)
755 {
756         struct nf_conntrack_tuple_hash *h;
757
758         read_lock_bh(&nf_conntrack_lock);
759         h = __nf_conntrack_find(tuple, ignored_conntrack);
760         read_unlock_bh(&nf_conntrack_lock);
761
762         return h != NULL;
763 }
764
765 /* There's a small race here where we may free a just-assured
766    connection.  Too bad: we're in trouble anyway. */
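/*
 * Note the two-step scheme below: the victim is chosen and its refcount
 * taken under the read lock, the lock is dropped, and only then do we race
 * del_timer() against death_by_timeout().  Holding the reference keeps the
 * conntrack from being freed between those two steps.
 */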
767 static int early_drop(struct list_head *chain)
768 {
769         /* Traverse backwards: gives us oldest, which is roughly LRU */
770         struct nf_conntrack_tuple_hash *h;
771         struct nf_conn *ct = NULL, *tmp;
772         int dropped = 0;
773
774         read_lock_bh(&nf_conntrack_lock);
775         list_for_each_entry_reverse(h, chain, list) {
776                 tmp = nf_ct_tuplehash_to_ctrack(h);
777                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
778                         ct = tmp;
779                         atomic_inc(&ct->ct_general.use);
780                         break;
781                 }
782         }
783         read_unlock_bh(&nf_conntrack_lock);
784
785         if (!ct)
786                 return dropped;
787
788         if (del_timer(&ct->timeout)) {
789                 death_by_timeout((unsigned long)ct);
790                 dropped = 1;
791                 NF_CT_STAT_INC(early_drop);
792         }
793         nf_ct_put(ct);
794         return dropped;
795 }
796
797 static struct nf_conntrack_helper *
798 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
799 {
800         struct nf_conntrack_helper *h;
801
802         list_for_each_entry(h, &helpers, list) {
803                 if (nf_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
804                         return h;
805         }
806         return NULL;
807 }
808
809 struct nf_conntrack_helper *
810 nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
811 {
812         struct nf_conntrack_helper *helper;
813
814         /* need nf_conntrack_lock to assure that helper exists until
815          * try_module_get() is called */
816         read_lock_bh(&nf_conntrack_lock);
817
818         helper = __nf_ct_helper_find(tuple);
819         if (helper) {
820                 /* need to increase module usage count to assure helper will
821                  * not go away while the caller is e.g. busy putting a
822                  * conntrack in the hash that uses the helper */
823                 if (!try_module_get(helper->me))
824                         helper = NULL;
825         }
826
827         read_unlock_bh(&nf_conntrack_lock);
828
829         return helper;
830 }
831
832 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
833 {
834         module_put(helper->me);
835 }
836
837 static struct nf_conn *
838 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
839                      const struct nf_conntrack_tuple *repl,
840                      const struct nf_conntrack_l3proto *l3proto)
841 {
842         struct nf_conn *conntrack = NULL;
843         u_int32_t features = 0;
844         struct nf_conntrack_helper *helper;
845
846         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
847                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
848                 nf_conntrack_hash_rnd_initted = 1;
849         }
850
851         /* We don't want any race condition at early drop stage */
852         atomic_inc(&nf_conntrack_count);
853
854         if (nf_conntrack_max
855             && atomic_read(&nf_conntrack_count) > nf_conntrack_max) {
856                 unsigned int hash = hash_conntrack(orig);
857                 /* Try dropping from this hash chain. */
858                 if (!early_drop(&nf_conntrack_hash[hash])) {
859                         atomic_dec(&nf_conntrack_count);
860                         if (net_ratelimit())
861                                 printk(KERN_WARNING
862                                        "nf_conntrack: table full, dropping"
863                                        " packet.\n");
864                         return ERR_PTR(-ENOMEM);
865                 }
866         }
867
868         /*  find features needed by this conntrack. */
869         features = l3proto->get_features(orig);
870
871         /* FIXME: protect helper list per RCU */
872         read_lock_bh(&nf_conntrack_lock);
873         helper = __nf_ct_helper_find(repl);
874         if (helper)
875                 features |= NF_CT_F_HELP;
876         read_unlock_bh(&nf_conntrack_lock);
877
878         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
879
880         read_lock_bh(&nf_ct_cache_lock);
881
882         if (unlikely(!nf_ct_cache[features].use)) {
883                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
884                         features);
885                 goto out;
886         }
887
888         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
889         if (conntrack == NULL) {
890                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
891                 goto out;
892         }
893
894         memset(conntrack, 0, nf_ct_cache[features].size);
895         conntrack->features = features;
896         if (helper) {
897                 struct nf_conn_help *help = nfct_help(conntrack);
898                 NF_CT_ASSERT(help);
899                 help->helper = helper;
900         }
901
902         atomic_set(&conntrack->ct_general.use, 1);
903         conntrack->ct_general.destroy = destroy_conntrack;
904         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
905         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
906         /* Don't set timer yet: wait for confirmation */
907         init_timer(&conntrack->timeout);
908         conntrack->timeout.data = (unsigned long)conntrack;
909         conntrack->timeout.function = death_by_timeout;
910         read_unlock_bh(&nf_ct_cache_lock);
911
912         return conntrack;
913 out:
914         read_unlock_bh(&nf_ct_cache_lock);
915         atomic_dec(&nf_conntrack_count);
916         return conntrack;
917 }
918
919 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
920                                    const struct nf_conntrack_tuple *repl)
921 {
922         struct nf_conntrack_l3proto *l3proto;
923
924         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
925         return __nf_conntrack_alloc(orig, repl, l3proto);
926 }
927
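/*
 * Note for callers: nf_conntrack_alloc() returns ERR_PTR(-ENOMEM) when the
 * table is full and early_drop() found nothing to evict, so check IS_ERR()
 * as well as NULL.  If protocol setup fails before the entry is placed on
 * any list, it is released directly with nf_conntrack_free() (see
 * init_conntrack() below for the real calling sequence).
 */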
928 void nf_conntrack_free(struct nf_conn *conntrack)
929 {
930         u_int32_t features = conntrack->features;
931         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
932         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
933                conntrack);
934         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
935         atomic_dec(&nf_conntrack_count);
936 }
937
938 /* Allocate a new conntrack: we return -ENOMEM if classification
939    failed due to stress.  Otherwise it really is unclassifiable. */
940 static struct nf_conntrack_tuple_hash *
941 init_conntrack(const struct nf_conntrack_tuple *tuple,
942                struct nf_conntrack_l3proto *l3proto,
943                struct nf_conntrack_protocol *protocol,
944                struct sk_buff *skb,
945                unsigned int dataoff)
946 {
947         struct nf_conn *conntrack;
948         struct nf_conntrack_tuple repl_tuple;
949         struct nf_conntrack_expect *exp;
950
951         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
952                 DEBUGP("Can't invert tuple.\n");
953                 return NULL;
954         }
955
956         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
957         if (conntrack == NULL || IS_ERR(conntrack)) {
958                 DEBUGP("Can't allocate conntrack.\n");
959                 return (struct nf_conntrack_tuple_hash *)conntrack;
960         }
961
962         if (!protocol->new(conntrack, skb, dataoff)) {
963                 nf_conntrack_free(conntrack);
964                 DEBUGP("init conntrack: can't track with proto module\n");
965                 return NULL;
966         }
967
968         write_lock_bh(&nf_conntrack_lock);
969         exp = find_expectation(tuple);
970
971         if (exp) {
972                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
973                         conntrack, exp);
974                 /* Welcome, Mr. Bond.  We've been expecting you... */
975                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
976                 conntrack->master = exp->master;
977 #ifdef CONFIG_NF_CONNTRACK_MARK
978                 conntrack->mark = exp->master->mark;
979 #endif
980 #ifdef CONFIG_NF_CONNTRACK_SECMARK
981                 conntrack->secmark = exp->master->secmark;
982 #endif
983                 nf_conntrack_get(&conntrack->master->ct_general);
984                 NF_CT_STAT_INC(expect_new);
985         } else
986                 NF_CT_STAT_INC(new);
987
988         /* Overload tuple linked list to put us in unconfirmed list. */
989         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
990
991         write_unlock_bh(&nf_conntrack_lock);
992
993         if (exp) {
994                 if (exp->expectfn)
995                         exp->expectfn(conntrack, exp);
996                 nf_conntrack_expect_put(exp);
997         }
998
999         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1000 }
1001
1002 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1003 static inline struct nf_conn *
1004 resolve_normal_ct(struct sk_buff *skb,
1005                   unsigned int dataoff,
1006                   u_int16_t l3num,
1007                   u_int8_t protonum,
1008                   struct nf_conntrack_l3proto *l3proto,
1009                   struct nf_conntrack_protocol *proto,
1010                   int *set_reply,
1011                   enum ip_conntrack_info *ctinfo)
1012 {
1013         struct nf_conntrack_tuple tuple;
1014         struct nf_conntrack_tuple_hash *h;
1015         struct nf_conn *ct;
1016
1017         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1018                              dataoff, l3num, protonum, &tuple, l3proto,
1019                              proto)) {
1020                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1021                 return NULL;
1022         }
1023
1024         /* look for tuple match */
1025         h = nf_conntrack_find_get(&tuple, NULL);
1026         if (!h) {
1027                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1028                 if (!h)
1029                         return NULL;
1030                 if (IS_ERR(h))
1031                         return (void *)h;
1032         }
1033         ct = nf_ct_tuplehash_to_ctrack(h);
1034
1035         /* It exists; we have (non-exclusive) reference. */
1036         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1037                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1038                 /* Please set reply bit if this packet OK */
1039                 *set_reply = 1;
1040         } else {
1041                 /* Once we've had two way comms, always ESTABLISHED. */
1042                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1043                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1044                         *ctinfo = IP_CT_ESTABLISHED;
1045                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1046                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1047                         *ctinfo = IP_CT_RELATED;
1048                 } else {
1049                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1050                         *ctinfo = IP_CT_NEW;
1051                 }
1052                 *set_reply = 0;
1053         }
1054         skb->nfct = &ct->ct_general;
1055         skb->nfctinfo = *ctinfo;
1056         return ct;
1057 }
1058
1059 unsigned int
1060 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1061 {
1062         struct nf_conn *ct;
1063         enum ip_conntrack_info ctinfo;
1064         struct nf_conntrack_l3proto *l3proto;
1065         struct nf_conntrack_protocol *proto;
1066         unsigned int dataoff;
1067         u_int8_t protonum;
1068         int set_reply = 0;
1069         int ret;
1070
1071         /* Previously seen (loopback or untracked)?  Ignore. */
1072         if ((*pskb)->nfct) {
1073                 NF_CT_STAT_INC(ignore);
1074                 return NF_ACCEPT;
1075         }
1076
1077         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1078         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1079                 DEBUGP("not prepared to track yet or error occurred\n");
1080                 return -ret;
1081         }
1082
1083         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1084
1085         /* It may be a special packet, error, unclean...
1086          * the inverse of the return code tells the netfilter
1087          * core what to do with the packet. */
1088         if (proto->error != NULL &&
1089             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1090                 NF_CT_STAT_INC(error);
1091                 NF_CT_STAT_INC(invalid);
1092                 return -ret;
1093         }
1094
1095         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1096                                &set_reply, &ctinfo);
1097         if (!ct) {
1098                 /* Not valid part of a connection */
1099                 NF_CT_STAT_INC(invalid);
1100                 return NF_ACCEPT;
1101         }
1102
1103         if (IS_ERR(ct)) {
1104                 /* Too stressed to deal. */
1105                 NF_CT_STAT_INC(drop);
1106                 return NF_DROP;
1107         }
1108
1109         NF_CT_ASSERT((*pskb)->nfct);
1110
1111         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1112         if (ret < 0) {
1113                 /* Invalid: inverse of the return code tells
1114                  * the netfilter core what to do */
1115                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1116                 nf_conntrack_put((*pskb)->nfct);
1117                 (*pskb)->nfct = NULL;
1118                 NF_CT_STAT_INC(invalid);
1119                 return -ret;
1120         }
1121
1122         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1123                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1124
1125         return ret;
1126 }
1127
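/*
 * nf_conntrack_in() is the hook entry point; per-family glue code registers
 * it on the inbound/outbound hooks and lets __nf_conntrack_confirm() run at
 * the last hook point.  A minimal sketch of such a registration, assuming
 * the IPv4 constants from <linux/netfilter_ipv4.h>; the example_ names are
 * purely illustrative and not how any particular module is wired.
 */
#if 0
#include <linux/netfilter_ipv4.h>

static unsigned int example_conntrack_hook(unsigned int hooknum,
					   struct sk_buff **pskb,
					   const struct net_device *in,
					   const struct net_device *out,
					   int (*okfn)(struct sk_buff *))
{
	return nf_conntrack_in(PF_INET, hooknum, pskb);
}

static struct nf_hook_ops example_ops = {
	.hook		= example_conntrack_hook,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};
/* registered with nf_register_hook(&example_ops) from module init */
#endif
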
1128 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1129                          const struct nf_conntrack_tuple *orig)
1130 {
1131         return nf_ct_invert_tuple(inverse, orig,
1132                                   __nf_ct_l3proto_find(orig->src.l3num),
1133                                   __nf_ct_proto_find(orig->src.l3num,
1134                                                      orig->dst.protonum));
1135 }
1136
1137 /* Would two expected things clash? */
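/*
 * Worked example: an expectation whose mask wildcards the destination port
 * clashes with one pinned to port 21 for the same addresses, because the
 * intersected mask ignores the port entirely and every field both masks
 * still care about compares equal.
 */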
1138 static inline int expect_clash(const struct nf_conntrack_expect *a,
1139                                const struct nf_conntrack_expect *b)
1140 {
1141         /* Part covered by intersection of masks must be unequal,
1142            otherwise they clash */
1143         struct nf_conntrack_tuple intersect_mask;
1144         int count;
1145
1146         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1147         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1148         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1149         intersect_mask.dst.protonum = a->mask.dst.protonum
1150                                         & b->mask.dst.protonum;
1151
1152         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1153                 intersect_mask.src.u3.all[count] =
1154                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1155         }
1156
1157         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1158                 intersect_mask.dst.u3.all[count] =
1159                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1160         }
1161
1162         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1163 }
1164
1165 static inline int expect_matches(const struct nf_conntrack_expect *a,
1166                                  const struct nf_conntrack_expect *b)
1167 {
1168         return a->master == b->master
1169                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1170                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1171 }
1172
1173 /* Generally a bad idea to call this: could have matched already. */
1174 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1175 {
1176         struct nf_conntrack_expect *i;
1177
1178         write_lock_bh(&nf_conntrack_lock);
1179         /* choose the oldest expectation to evict */
1180         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1181                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1182                         nf_ct_unlink_expect(i);
1183                         write_unlock_bh(&nf_conntrack_lock);
1184                         nf_conntrack_expect_put(i);
1185                         return;
1186                 }
1187         }
1188         write_unlock_bh(&nf_conntrack_lock);
1189 }
1190
1191 /* We don't increase the master conntrack refcount for non-fulfilled
1192  * conntracks. During the conntrack destruction, the expectations are
1193  * always killed before the conntrack itself */
1194 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1195 {
1196         struct nf_conntrack_expect *new;
1197
1198         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1199         if (!new) {
1200                 DEBUGP("expect_related: OOM allocating expect\n");
1201                 return NULL;
1202         }
1203         new->master = me;
1204         atomic_set(&new->use, 1);
1205         return new;
1206 }
1207
1208 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1209 {
1210         if (atomic_dec_and_test(&exp->use))
1211                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1212 }
1213
1214 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1215 {
1216         struct nf_conn_help *master_help = nfct_help(exp->master);
1217
1218         atomic_inc(&exp->use);
1219         master_help->expecting++;
1220         list_add(&exp->list, &nf_conntrack_expect_list);
1221
1222         init_timer(&exp->timeout);
1223         exp->timeout.data = (unsigned long)exp;
1224         exp->timeout.function = expectation_timed_out;
1225         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1226         add_timer(&exp->timeout);
1227
1228         exp->id = ++nf_conntrack_expect_next_id;
1229         atomic_inc(&exp->use);
1230         NF_CT_STAT_INC(expect_create);
1231 }
1232
1233 /* Race with expectations being used means we could have none to find; OK. */
1234 static void evict_oldest_expect(struct nf_conn *master)
1235 {
1236         struct nf_conntrack_expect *i;
1237
1238         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1239                 if (i->master == master) {
1240                         if (del_timer(&i->timeout)) {
1241                                 nf_ct_unlink_expect(i);
1242                                 nf_conntrack_expect_put(i);
1243                         }
1244                         break;
1245                 }
1246         }
1247 }
1248
1249 static inline int refresh_timer(struct nf_conntrack_expect *i)
1250 {
1251         struct nf_conn_help *master_help = nfct_help(i->master);
1252
1253         if (!del_timer(&i->timeout))
1254                 return 0;
1255
1256         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1257         add_timer(&i->timeout);
1258         return 1;
1259 }
1260
1261 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1262 {
1263         struct nf_conntrack_expect *i;
1264         struct nf_conn *master = expect->master;
1265         struct nf_conn_help *master_help = nfct_help(master);
1266         int ret;
1267
1268         NF_CT_ASSERT(master_help);
1269
1270         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1271         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1272         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1273
1274         write_lock_bh(&nf_conntrack_lock);
1275         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1276                 if (expect_matches(i, expect)) {
1277                         /* Refresh timer: if it's dying, ignore.. */
1278                         if (refresh_timer(i)) {
1279                                 ret = 0;
1280                                 goto out;
1281                         }
1282                 } else if (expect_clash(i, expect)) {
1283                         ret = -EBUSY;
1284                         goto out;
1285                 }
1286         }
1287         /* Will be over limit? */
1288         if (master_help->helper->max_expected &&
1289             master_help->expecting >= master_help->helper->max_expected)
1290                 evict_oldest_expect(master);
1291
1292         nf_conntrack_expect_insert(expect);
1293         nf_conntrack_expect_event(IPEXP_NEW, expect);
1294         ret = 0;
1295 out:
1296         write_unlock_bh(&nf_conntrack_lock);
1297         return ret;
1298 }
1299
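/*
 * Usage sketch, loosely modelled on how a helper sets up an expectation for
 * a secondary connection: allocate, fill in tuple and mask, register, then
 * drop the allocation reference.  The example_ function and its tuple/mask
 * arguments are purely illustrative.
 */
#if 0
static int example_expect_secondary(struct nf_conn *ct,
				    const struct nf_conntrack_tuple *tuple,
				    const struct nf_conntrack_tuple *mask)
{
	struct nf_conntrack_expect *exp;
	int ret;

	exp = nf_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;

	exp->tuple = *tuple;	/* what the expected connection will look like */
	exp->mask = *mask;	/* which parts of the tuple must match exactly */
	exp->expectfn = NULL;
	exp->flags = 0;

	ret = nf_conntrack_expect_related(exp);
	nf_conntrack_expect_put(exp);	/* drop the allocation reference */
	return ret;
}
#endif
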
1300 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1301 {
1302         int ret;
1303         BUG_ON(me->timeout == 0);
1304
1305         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1306                                           sizeof(struct nf_conn)
1307                                           + sizeof(struct nf_conn_help)
1308                                           + __alignof__(struct nf_conn_help));
1309         if (ret < 0) {
1310                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1311                 return ret;
1312         }
1313         write_lock_bh(&nf_conntrack_lock);
1314         list_add(&me->list, &helpers);
1315         write_unlock_bh(&nf_conntrack_lock);
1316
1317         return 0;
1318 }
1319
1320 struct nf_conntrack_helper *
1321 __nf_conntrack_helper_find_byname(const char *name)
1322 {
1323         struct nf_conntrack_helper *h;
1324
1325         list_for_each_entry(h, &helpers, list) {
1326                 if (!strcmp(h->name, name))
1327                         return h;
1328         }
1329
1330         return NULL;
1331 }
1332
1333 static inline void unhelp(struct nf_conntrack_tuple_hash *i,
1334                           const struct nf_conntrack_helper *me)
1335 {
1336         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1337         struct nf_conn_help *help = nfct_help(ct);
1338
1339         if (help && help->helper == me) {
1340                 nf_conntrack_event(IPCT_HELPER, ct);
1341                 help->helper = NULL;
1342         }
1343 }
1344
1345 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1346 {
1347         unsigned int i;
1348         struct nf_conntrack_tuple_hash *h;
1349         struct nf_conntrack_expect *exp, *tmp;
1350
1351         /* Need write lock here, to delete helper. */
1352         write_lock_bh(&nf_conntrack_lock);
1353         list_del(&me->list);
1354
1355         /* Get rid of expectations */
1356         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1357                 struct nf_conn_help *help = nfct_help(exp->master);
1358                 if (help->helper == me && del_timer(&exp->timeout)) {
1359                         nf_ct_unlink_expect(exp);
1360                         nf_conntrack_expect_put(exp);
1361                 }
1362         }
1363
1364         /* Set the helper to NULL on all remaining conntracks. */
1365         list_for_each_entry(h, &unconfirmed, list)
1366                 unhelp(h, me);
1367         for (i = 0; i < nf_conntrack_htable_size; i++) {
1368                 list_for_each_entry(h, &nf_conntrack_hash[i], list)
1369                         unhelp(h, me);
1370         }
1371         write_unlock_bh(&nf_conntrack_lock);
1372
1373         /* Someone could still be looking at the helper in a bh. */
1374         synchronize_net();
1375 }
1376
1377 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
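/* Callers normally use the nf_ct_refresh()/nf_ct_refresh_acct() wrappers
 * from the header, which invoke this with do_acct of 0 and 1 respectively. */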
1378 void __nf_ct_refresh_acct(struct nf_conn *ct,
1379                           enum ip_conntrack_info ctinfo,
1380                           const struct sk_buff *skb,
1381                           unsigned long extra_jiffies,
1382                           int do_acct)
1383 {
1384         int event = 0;
1385
1386         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1387         NF_CT_ASSERT(skb);
1388
1389         write_lock_bh(&nf_conntrack_lock);
1390
1391         /* Only update if this is not a fixed timeout */
1392         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1393                 write_unlock_bh(&nf_conntrack_lock);
1394                 return;
1395         }
1396
1397         /* If not in hash table, timer will not be active yet */
1398         if (!nf_ct_is_confirmed(ct)) {
1399                 ct->timeout.expires = extra_jiffies;
1400                 event = IPCT_REFRESH;
1401         } else {
1402                 /* Need del_timer for race avoidance (may already be dying). */
1403                 if (del_timer(&ct->timeout)) {
1404                         ct->timeout.expires = jiffies + extra_jiffies;
1405                         add_timer(&ct->timeout);
1406                         event = IPCT_REFRESH;
1407                 }
1408         }
1409
1410 #ifdef CONFIG_NF_CT_ACCT
1411         if (do_acct) {
1412                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1413                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1414                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1415                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1416                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1417                         event |= IPCT_COUNTER_FILLING;
1418         }
1419 #endif
1420
1421         write_unlock_bh(&nf_conntrack_lock);
1422
1423         /* must be unlocked when calling event cache */
1424         if (event)
1425                 nf_conntrack_event_cache(event, skb);
1426 }
1427
1428 #if defined(CONFIG_NF_CT_NETLINK) || \
1429     defined(CONFIG_NF_CT_NETLINK_MODULE)
1430
1431 #include <linux/netfilter/nfnetlink.h>
1432 #include <linux/netfilter/nfnetlink_conntrack.h>
1433 #include <linux/mutex.h>
1434
1435
1436 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1437  * in ip_conntrack_core, since we don't want the protocols to autoload
1438  * or depend on ctnetlink */
1439 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1440                                const struct nf_conntrack_tuple *tuple)
1441 {
1442         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1443                 &tuple->src.u.tcp.port);
1444         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1445                 &tuple->dst.u.tcp.port);
1446         return 0;
1447
1448 nfattr_failure:
1449         return -1;
1450 }
1451
1452 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1453         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1454         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1455 };
1456
1457 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1458                                struct nf_conntrack_tuple *t)
1459 {
1460         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1461                 return -EINVAL;
1462
1463         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1464                 return -EINVAL;
1465
1466         t->src.u.tcp.port =
1467                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1468         t->dst.u.tcp.port =
1469                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1470
1471         return 0;
1472 }
1473 #endif
1474
1475 /* Used by ipt_REJECT and ip6t_REJECT. */
1476 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1477 {
1478         struct nf_conn *ct;
1479         enum ip_conntrack_info ctinfo;
1480
1481         /* This ICMP is in the reverse direction to the packet which caused it */
1482         ct = nf_ct_get(skb, &ctinfo);
1483         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1484                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1485         else
1486                 ctinfo = IP_CT_RELATED;
1487
1488         /* Attach to new skbuff, and increment count */
1489         nskb->nfct = &ct->ct_general;
1490         nskb->nfctinfo = ctinfo;
1491         nf_conntrack_get(nskb->nfct);
1492 }
1493
1494 static inline int
1495 do_iter(const struct nf_conntrack_tuple_hash *i,
1496         int (*iter)(struct nf_conn *i, void *data),
1497         void *data)
1498 {
1499         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1500 }
1501
1502 /* Bring out ya dead! */
1503 static struct nf_conn *
1504 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1505                 void *data, unsigned int *bucket)
1506 {
1507         struct nf_conntrack_tuple_hash *h;
1508         struct nf_conn *ct;
1509
1510         write_lock_bh(&nf_conntrack_lock);
1511         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1512                 list_for_each_entry(h, &nf_conntrack_hash[*bucket], list) {
1513                         ct = nf_ct_tuplehash_to_ctrack(h);
1514                         if (iter(ct, data))
1515                                 goto found;
1516                 }
1517         }
1518         list_for_each_entry(h, &unconfirmed, list) {
1519                 ct = nf_ct_tuplehash_to_ctrack(h);
1520                 if (iter(ct, data))
1521                         goto found;
1522         }
1523         return NULL;
1524 found:
1525         atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1526         write_unlock_bh(&nf_conntrack_lock);
1527         return ct;
1528 }
1529
1530 void
1531 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1532 {
1533         struct nf_conn *ct;
1534         unsigned int bucket = 0;
1535
1536         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1537                 /* Time to push up daisies... */
1538                 if (del_timer(&ct->timeout))
1539                         death_by_timeout((unsigned long)ct);
1540                 /* ... else the timer will get him soon. */
1541
1542                 nf_ct_put(ct);
1543         }
1544 }
1545
1546 static int kill_all(struct nf_conn *i, void *data)
1547 {
1548         return 1;
1549 }
1550
1551 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1552 {
1553         if (vmalloced)
1554                 vfree(hash);
1555         else
1556                 free_pages((unsigned long)hash, 
1557                            get_order(sizeof(struct list_head) * size));
1558 }
1559
1560 void nf_conntrack_flush(void)
1561 {
1562         nf_ct_iterate_cleanup(kill_all, NULL);
1563 }
1564
1565 /* Mishearing the voices in his head, our hero wonders how he's
1566    supposed to kill the mall. */
1567 void nf_conntrack_cleanup(void)
1568 {
1569         int i;
1570
1571         ip_ct_attach = NULL;
1572
1573         /* This makes sure all current packets have passed through
1574            netfilter framework.  Roll on, two-stage module
1575            delete... */
1576         synchronize_net();
1577
1578         nf_ct_event_cache_flush();
1579  i_see_dead_people:
1580         nf_conntrack_flush();
1581         if (atomic_read(&nf_conntrack_count) != 0) {
1582                 schedule();
1583                 goto i_see_dead_people;
1584         }
1585         /* wait until all references to nf_conntrack_untracked are dropped */
1586         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1587                 schedule();
1588
1589         for (i = 0; i < NF_CT_F_NUM; i++) {
1590                 if (nf_ct_cache[i].use == 0)
1591                         continue;
1592
1593                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1594                 nf_ct_cache[i].use = 1;
1595                 nf_conntrack_unregister_cache(i);
1596         }
1597         kmem_cache_destroy(nf_conntrack_expect_cachep);
1598         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1599                             nf_conntrack_htable_size);
1600
1601         /* free l3proto protocol tables */
1602         for (i = 0; i < PF_MAX; i++)
1603                 if (nf_ct_protos[i]) {
1604                         kfree(nf_ct_protos[i]);
1605                         nf_ct_protos[i] = NULL;
1606                 }
1607 }
1608
1609 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1610 {
1611         struct list_head *hash;
1612         unsigned int i;
1613
1614         *vmalloced = 0; 
1615         hash = (void*)__get_free_pages(GFP_KERNEL, 
1616                                        get_order(sizeof(struct list_head)
1617                                                  * size));
1618         if (!hash) { 
1619                 *vmalloced = 1;
1620                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1621                 hash = vmalloc(sizeof(struct list_head) * size);
1622         }
1623
1624         if (hash)
1625                 for (i = 0; i < size; i++) 
1626                         INIT_LIST_HEAD(&hash[i]);
1627
1628         return hash;
1629 }
1630
1631 int set_hashsize(const char *val, struct kernel_param *kp)
1632 {
1633         int i, bucket, hashsize, vmalloced;
1634         int old_vmalloced, old_size;
1635         int rnd;
1636         struct list_head *hash, *old_hash;
1637         struct nf_conntrack_tuple_hash *h;
1638
1639         /* On boot, we can set this without any fancy locking. */
1640         if (!nf_conntrack_htable_size)
1641                 return param_set_uint(val, kp);
1642
1643         hashsize = simple_strtol(val, NULL, 0);
1644         if (!hashsize)
1645                 return -EINVAL;
1646
1647         hash = alloc_hashtable(hashsize, &vmalloced);
1648         if (!hash)
1649                 return -ENOMEM;
1650
1651         /* We have to rehash for the new table anyway, so we can also
1652          * use a new random seed */
1653         get_random_bytes(&rnd, 4);
1654
1655         write_lock_bh(&nf_conntrack_lock);
1656         for (i = 0; i < nf_conntrack_htable_size; i++) {
1657                 while (!list_empty(&nf_conntrack_hash[i])) {
1658                         h = list_entry(nf_conntrack_hash[i].next,
1659                                        struct nf_conntrack_tuple_hash, list);
1660                         list_del(&h->list);
1661                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1662                         list_add_tail(&h->list, &hash[bucket]);
1663                 }
1664         }
1665         old_size = nf_conntrack_htable_size;
1666         old_vmalloced = nf_conntrack_vmalloc;
1667         old_hash = nf_conntrack_hash;
1668
1669         nf_conntrack_htable_size = hashsize;
1670         nf_conntrack_vmalloc = vmalloced;
1671         nf_conntrack_hash = hash;
1672         nf_conntrack_hash_rnd = rnd;
1673         write_unlock_bh(&nf_conntrack_lock);
1674
1675         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1676         return 0;
1677 }
1678
1679 module_param_call(hashsize, set_hashsize, param_get_uint,
1680                   &nf_conntrack_htable_size, 0600);
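     /* Since the parameter is registered with mode 0600, the hash size can be
      * set when the module is loaded (modprobe nf_conntrack hashsize=N) or
      * resized later by root at runtime, e.g.:
      *
      *      echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
      *
      * which lands in set_hashsize() above and rehashes every entry into the
      * new table. */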
1681
1682 int __init nf_conntrack_init(void)
1683 {
1684         unsigned int i;
1685         int ret;
1686
1687         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1688          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
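             /* Worked example, assuming an 8-byte struct list_head (32-bit):
              *   32 MB: (33554432 / 16384) / 8 = 256 buckets
              *    1 GB: (1073741824 / 16384) / 8 = 8192 buckets, which is also
              *          the cap applied below for larger machines. */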
1689         if (!nf_conntrack_htable_size) {
1690                 nf_conntrack_htable_size
1691                         = (((num_physpages << PAGE_SHIFT) / 16384)
1692                            / sizeof(struct list_head));
1693                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1694                         nf_conntrack_htable_size = 8192;
1695                 if (nf_conntrack_htable_size < 16)
1696                         nf_conntrack_htable_size = 16;
1697         }
1698         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1699
1700         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1701                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1702                nf_conntrack_max);
1703
1704         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1705                                             &nf_conntrack_vmalloc);
1706         if (!nf_conntrack_hash) {
1707                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1708                 goto err_out;
1709         }
1710
1711         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1712                                           sizeof(struct nf_conn));
1713         if (ret < 0) {
1714                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1715                 goto err_free_hash;
1716         }
1717
1718         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1719                                         sizeof(struct nf_conntrack_expect),
1720                                         0, 0, NULL, NULL);
1721         if (!nf_conntrack_expect_cachep) {
1722                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1723                 goto err_free_conntrack_slab;
1724         }
1725
1726         /* Don't NEED lock here, but good form anyway. */
1727         write_lock_bh(&nf_conntrack_lock);
1728         for (i = 0; i < PF_MAX; i++)
1729                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1730         write_unlock_bh(&nf_conntrack_lock);
1731
1732         /* For use by REJECT target */
1733         ip_ct_attach = __nf_conntrack_attach;
1734
1735         /* Set up fake conntrack:
1736             - to never be deleted, not in any hashes */
1737         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1738         /*  - and make it look like a confirmed connection */
1739         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1740
1741         return ret;
1742
1743 err_free_conntrack_slab:
1744         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1745 err_free_hash:
1746         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1747                             nf_conntrack_htable_size);
1748 err_out:
1749         return -ENOMEM;
1750 }