[NETFILTER]: conntrack: fix race condition in early_drop
net/ipv4/netfilter/ip_conntrack_core.c
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>

#define IP_CONNTRACK_VERSION    "2.4"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size __read_mostly = 0;
int ip_conntrack_max __read_mostly;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep __read_mostly;
static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid __read_mostly;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

static unsigned int ip_conntrack_next_id;
static unsigned int ip_conntrack_expect_next_id;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
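
/*
 * Event cache design note: each CPU caches at most one conntrack
 * (ip_conntrack_ecache.ct) together with a bitmask of pending events for
 * it.  Events accumulate while a packet traverses the hooks and are
 * delivered in a single atomic_notifier_call_chain() call, either when
 * the slot is reused for a different conntrack (__ip_ct_event_cache_init)
 * or via ip_ct_deliver_cached_events().  The cache holds a reference on
 * the conntrack, dropped at delivery time.
 */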

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
        DEBUGP("ecache: delivering events for %p\n", ecache->ct);
        if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
                atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
                                    ecache->ct);
        ecache->events = 0;
        ip_conntrack_put(ecache->ct);
        ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack. This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        local_bh_disable();
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        if (ecache->ct == ct)
                __ip_ct_deliver_cached_events(ecache);
        local_bh_enable();
}

void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache;

        /* take care of delivering potentially old events */
        ecache = &__get_cpu_var(ip_conntrack_ecache);
        BUG_ON(ecache->ct == ct);
        if (ecache->ct)
                __ip_ct_deliver_cached_events(ecache);
        /* initialize for this conntrack/packet */
        ecache->ct = ct;
        nf_conntrack_get(&ct->ct_general);
}

/* flush the event cache - touches other CPU's data and must not be called
 * while packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
        struct ip_conntrack_ecache *ecache;
        int cpu;

        for_each_possible_cpu(cpu) {
                ecache = &per_cpu(ip_conntrack_ecache, cpu);
                if (ecache->ct)
                        ip_conntrack_put(ecache->ct);
        }
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
                            unsigned int size, unsigned int rnd)
{
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             rnd) % size);
}

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
        return __hash_conntrack(tuple, ip_conntrack_htable_size,
                                ip_conntrack_hash_rnd);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        /* Never happens */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */
void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        list_del(&exp->list);
        CONNTRACK_STAT_INC(expect_delete);
        exp->master->expecting--;
        ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        write_lock_bh(&ip_conntrack_lock);
        ip_ct_unlink_expect(exp);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_expect_put(exp);
}

struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
                        atomic_inc(&i->use);
                        return i;
                }
        }
        return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        read_lock_bh(&ip_conntrack_lock);
        i = __ip_conntrack_expect_find(tuple);
        read_unlock_bh(&ip_conntrack_lock);

        return i;
}

/* If an expectation for this connection is found, it is deleted from
 * the global list and then returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If master is not in hash table yet (ie. packet hasn't left
                   this machine yet), how can other end know about expected?
                   Hence these are not the droids you are looking for (if
                   master ct never got confirmed, we'd hold a reference to it
                   and weird things would happen to future packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)) {
                        if (i->flags & IP_CT_EXPECT_PERMANENT) {
                                atomic_inc(&i->use);
                                return i;
                        } else if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                return i;
                        }
                }
        }
        return NULL;
}
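
/*
 * Note the asymmetry above: a PERMANENT expectation survives being
 * matched and only gains a reference, while an ordinary one-shot
 * expectation is consumed.  del_timer() doubles as the "nobody else got
 * here first" test: losing that race means the expectation is already
 * being torn down.
 */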

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        ip_conntrack_expect_put(i);
                }
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        DEBUGP("clean_from_lists(%p)\n", ct);
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);

        /* Destroy all pending expectations */
        ip_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        ip_conntrack_event(IPCT_DESTROY, ct);
        set_bit(IPS_DYING_BIT, &ct->status);

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        write_lock_bh(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too. */
        ip_ct_remove_expectations(ct);

        /* We overload first tuple to link into unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        write_unlock_bh(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        write_lock_bh(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        ASSERT_READ_LOCK(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (tuplehash_to_ctrack(h) != ignored_conntrack &&
                    ip_ct_tuple_equal(tuple, &h->tuple)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        read_unlock_bh(&ip_conntrack_lock);

        return h;
}

static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
                                        unsigned int hash,
                                        unsigned int repl_hash)
{
        ct->id = ++ip_conntrack_next_id;
        list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
                 &ip_conntrack_hash[hash]);
        list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
                 &ip_conntrack_hash[repl_hash]);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
        unsigned int hash, repl_hash;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        write_lock_bh(&ip_conntrack_lock);
        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in the other direction.  The actual packet
           which created the connection will be IP_CT_NEW or, for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        write_lock_bh(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        list_for_each_entry(h, &ip_conntrack_hash[hash], list)
                if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      &h->tuple))
                        goto out;
        list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
                if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple))
                        goto out;

        /* Remove from unconfirmed list */
        list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout.expires += jiffies;
        add_timer(&ct->timeout);
        atomic_inc(&ct->ct_general.use);
        set_bit(IPS_CONFIRMED_BIT, &ct->status);
        CONNTRACK_STAT_INC(insert);
        write_unlock_bh(&ip_conntrack_lock);
        if (ct->helper)
                ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
        if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
            test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
        ip_conntrack_event_cache(master_ct(ct) ?
                                 IPCT_RELATED : IPCT_NEW, *pskb);

        return NF_ACCEPT;

out:
        CONNTRACK_STAT_INC(insert_failed);
        write_unlock_bh(&ip_conntrack_lock);
        return NF_DROP;
}
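
/*
 * Confirmation is the only path by which a conntrack enters the hash
 * table, and the double lookup under the write lock above is what
 * resolves the race with NAT: if another context inserted a clashing
 * tuple first, the packet is dropped instead of corrupting the table.
 * The atomic_inc() before IPS_CONFIRMED_BIT is the reference owned by
 * the timer started just above it.
 */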

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        read_unlock_bh(&ip_conntrack_lock);

        return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL, *tmp;
        int dropped = 0;

        read_lock_bh(&ip_conntrack_lock);
        list_for_each_entry_reverse(h, chain, list) {
                tmp = tuplehash_to_ctrack(h);
                if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
                        ct = tmp;
                        atomic_inc(&ct->ct_general.use);
                        break;
                }
        }
        read_unlock_bh(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}
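
/*
 * This is the post-fix shape of early_drop referred to by this tree's
 * "[NETFILTER]: conntrack: fix race condition in early_drop" commit: a
 * reference is taken on the victim while the read lock is still held,
 * so it cannot be freed between selection and del_timer(); del_timer()
 * succeeding then guarantees we are the only ones killing it.
 */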

static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
                        return h;
        }
        return NULL;
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *helper;

        /* need ip_conntrack_lock to assure that helper exists until
         * try_module_get() is called */
        read_lock_bh(&ip_conntrack_lock);

        helper = __ip_conntrack_helper_find(tuple);
        if (helper) {
                /* need to increase module usage count to assure helper will
                 * not go away while the caller is e.g. busy putting a
                 * conntrack in the hash that uses the helper */
                if (!try_module_get(helper->me))
                        helper = NULL;
        }

        read_unlock_bh(&ip_conntrack_lock);

        return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
        module_put(helper->me);
}

struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
        return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        preempt_disable();
        p = __ip_conntrack_proto_find(protocol);
        if (p) {
                if (!try_module_get(p->me))
                        p = &ip_conntrack_generic_protocol;
        }
        preempt_enable();

        return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
        module_put(p->me);
}

struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
                                        struct ip_conntrack_tuple *repl)
{
        struct ip_conntrack *conntrack;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        /* We don't want any race condition at early drop stage: the count
         * is incremented before the limit check, so concurrent allocators
         * are visible to each other. */
        atomic_inc(&ip_conntrack_count);

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
                unsigned int hash = hash_conntrack(orig);
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        atomic_dec(&ip_conntrack_count);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                atomic_dec(&ip_conntrack_count);
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        return conntrack;
}

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
        atomic_dec(&ip_conntrack_count);
        kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        struct ip_conntrack_expect *exp;

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
        if (conntrack == NULL || IS_ERR(conntrack))
                return (struct ip_conntrack_tuple_hash *)conntrack;

        if (!protocol->new(conntrack, skb)) {
                ip_conntrack_free(conntrack);
                return NULL;
        }

        write_lock_bh(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
                /* this is ugly, but there is no other place to put it */
                conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
                conntrack->secmark = exp->master->secmark;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        write_unlock_bh(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                ip_conntrack_expect_put(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
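
/*
 * Lifecycle recap: ip_conntrack_alloc() creates the entry, the
 * protocol's new() callback initialises per-protocol state,
 * init_conntrack() parks the entry on the unconfirmed list, and only
 * __ip_conntrack_confirm() moves it into the real hash table and arms
 * its timer, once the first packet has made it through the hooks.
 */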

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet is OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}
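
/*
 * The "+ IP_CT_IS_REPLY" above is not a typo: enum ip_conntrack_info
 * encodes direction as an offset, so any value >= IP_CT_IS_REPLY
 * denotes a reply-direction packet, which is exactly how CTINFO2DIR()
 * recovers the direction.
 */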

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply = 0;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        /* Never happens */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

        /* It may be a special packet: error, unclean...  The inverse of
         * the return code tells the netfilter core what to do with the
         * packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo))) {
                /* Not valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_STATUS, *pskb);

        return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        write_lock_bh(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        ip_ct_unlink_expect(i);
                        write_unlock_bh(&ip_conntrack_lock);
                        ip_conntrack_expect_put(i);
                        return;
                }
        }
        write_unlock_bh(&ip_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * expectations. During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
        new->master = me;
        atomic_set(&new->use, 1);
        return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                kmem_cache_free(ip_conntrack_expect_cachep, exp);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->use);
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        init_timer(&exp->timeout);
        exp->timeout.data = (unsigned long)exp;
        exp->timeout.function = expectation_timed_out;
        exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
        add_timer(&exp->timeout);

        exp->id = ++ip_conntrack_expect_next_id;
        atomic_inc(&exp->use);
        CONNTRACK_STAT_INC(expect_create);
}
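
/*
 * Refcounting note: insertion takes two references on top of the
 * allocator's one - the first atomic_inc() is owned by the expectation
 * list (dropped in ip_ct_unlink_expect), the second by the running
 * timer (dropped by expectation_timed_out, or transferred to whoever
 * wins del_timer() on the expectation).
 */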

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                ip_ct_unlink_expect(i);
                                ip_conntrack_expect_put(i);
                        }
                        break;
                }
        }
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        write_lock_bh(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will be over limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ip_conntrack_expect_event(IPEXP_NEW, expect);
        ret = 0;
out:
        write_unlock_bh(&ip_conntrack_lock);
        return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        write_lock_bh(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = __ip_conntrack_helper_find(newreply);
        write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        write_lock_bh(&ip_conntrack_lock);
        list_add(&me->list, &helpers);
        write_unlock_bh(&ip_conntrack_lock);

        return 0;
}
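
/*
 * For reference, a helper module registers itself roughly like this
 * (a minimal sketch modelled on the FTP helper; the my_ftp_help()
 * callback and the port are illustrative, not part of this file):
 *
 *      static struct ip_conntrack_helper my_helper;
 *
 *      my_helper.name = "my-ftp";
 *      my_helper.me = THIS_MODULE;
 *      my_helper.max_expected = 1;
 *      my_helper.timeout = 5 * 60;                  (seconds)
 *      my_helper.tuple.src.u.tcp.port = htons(21);  (matched against the
 *      my_helper.tuple.dst.protonum = IPPROTO_TCP;   reply tuple)
 *      my_helper.mask.src.u.tcp.port = 0xFFFF;
 *      my_helper.mask.dst.protonum = 0xFF;
 *      my_helper.help = my_ftp_help;
 *      ip_conntrack_helper_register(&my_helper);
 */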

struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (!strcmp(h->name, name))
                        return h;
        }

        return NULL;
}

static inline void unhelp(struct ip_conntrack_tuple_hash *i,
                          const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me) {
                ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
                tuplehash_to_ctrack(i)->helper = NULL;
        }
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        write_lock_bh(&ip_conntrack_lock);
        list_del(&me->list);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        ip_ct_unlink_expect(exp);
                        ip_conntrack_expect_put(exp);
                }
        }
        /* Get rid of expecteds, set helpers to NULL. */
        list_for_each_entry(h, &unconfirmed, list)
                unhelp(h, me);
        for (i = 0; i < ip_conntrack_htable_size; i++) {
                list_for_each_entry(h, &ip_conntrack_hash[i], list)
                        unhelp(h, me);
        }
        write_unlock_bh(&ip_conntrack_lock);

        /* Someone could be still looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies,
                        int do_acct)
{
        int event = 0;

        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
        IP_NF_ASSERT(skb);

        write_lock_bh(&ip_conntrack_lock);

        /* Only update if this is not a fixed timeout */
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                write_unlock_bh(&ip_conntrack_lock);
                return;
        }

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                event = IPCT_REFRESH;
        } else {
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                        event = IPCT_REFRESH;
                }
        }

#ifdef CONFIG_IP_NF_CT_ACCT
        if (do_acct) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                                ntohs(skb->nh.iph->tot_len);
                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
                        event |= IPCT_COUNTER_FILLING;
        }
#endif

        write_unlock_bh(&ip_conntrack_lock);

        /* must be unlocked when calling event cache */
        if (event)
                ip_conntrack_event_cache(event, skb);
}

#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                               const struct ip_conntrack_tuple *tuple)
{
        NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
                &tuple->src.u.tcp.port);
        NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
                &tuple->dst.u.tcp.port);
        return 0;

nfattr_failure:
        return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct ip_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        t->src.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
#endif

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
        skb_orphan(skb);

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (skb)
                ip_send_check(skb->nh.iph);
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct ip_conntrack *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        write_lock_bh(&ip_conntrack_lock);
        for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
                list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
                        ct = tuplehash_to_ctrack(h);
                        if (iter(ct, data))
                                goto found;
                }
        }
        list_for_each_entry(h, &unconfirmed, list) {
                ct = tuplehash_to_ctrack(h);
                if (iter(ct, data))
                        goto found;
        }
        write_unlock_bh(&ip_conntrack_lock);
        return NULL;

found:
        atomic_inc(&ct->ct_general.use);
        write_unlock_bh(&ip_conntrack_lock);
        return ct;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack *ct;
        unsigned int bucket = 0;

        while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
                /* Time to push up daises... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;
                memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
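
/*
 * For reference, a transparent proxy that accepted a redirected
 * connection recovers the pre-NAT destination from userspace like this
 * (a sketch; error handling elided, SO_ORIGINAL_DST comes from
 * <linux/netfilter_ipv4.h>):
 *
 *      struct sockaddr_in orig;
 *      socklen_t olen = sizeof(orig);
 *
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &olen) == 0)
 *              ... connect to orig.sin_addr / orig.sin_port ...
 */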

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}

void ip_conntrack_flush(void)
{
        ip_ct_iterate_cleanup(kill_all, NULL);
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
        if (vmalloced)
                vfree(hash);
        else
                free_pages((unsigned long)hash,
                           get_order(sizeof(struct list_head) * size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;

        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

        ip_ct_event_cache_flush();
 i_see_dead_people:
        ip_conntrack_flush();
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to ip_conntrack_untracked are dropped */
        while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
                schedule();

        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
        nf_unregister_sockopt(&so_getorigdst);
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
        struct list_head *hash;
        unsigned int i;

        *vmalloced = 0;
        hash = (void *)__get_free_pages(GFP_KERNEL,
                                        get_order(sizeof(struct list_head)
                                                  * size));
        if (!hash) {
                *vmalloced = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                hash = vmalloc(sizeof(struct list_head) * size);
        }

        if (hash)
                for (i = 0; i < size; i++)
                        INIT_LIST_HEAD(&hash[i]);

        return hash;
}

static int set_hashsize(const char *val, struct kernel_param *kp)
{
        int i, bucket, hashsize, vmalloced;
        int old_vmalloced, old_size;
        int rnd;
        struct list_head *hash, *old_hash;
        struct ip_conntrack_tuple_hash *h;

        /* On boot, we can set this without any fancy locking. */
        if (!ip_conntrack_htable_size)
                return param_set_int(val, kp);

        hashsize = simple_strtol(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;

        hash = alloc_hashtable(hashsize, &vmalloced);
        if (!hash)
                return -ENOMEM;

        /* We have to rehash for the new table anyway, so we also can
         * use a new random seed */
        get_random_bytes(&rnd, 4);

        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < ip_conntrack_htable_size; i++) {
                while (!list_empty(&ip_conntrack_hash[i])) {
                        h = list_entry(ip_conntrack_hash[i].next,
                                       struct ip_conntrack_tuple_hash, list);
                        list_del(&h->list);
                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
                        list_add_tail(&h->list, &hash[bucket]);
                }
        }
        old_size = ip_conntrack_htable_size;
        old_vmalloced = ip_conntrack_vmalloc;
        old_hash = ip_conntrack_hash;

        ip_conntrack_htable_size = hashsize;
        ip_conntrack_vmalloc = vmalloced;
        ip_conntrack_hash = hash;
        ip_conntrack_hash_rnd = rnd;
        write_unlock_bh(&ip_conntrack_lock);

        free_conntrack_hash(old_hash, old_vmalloced, old_size);
        return 0;
}
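
/*
 * Resizing note: every tuplehash (both directions of every conntrack)
 * is moved to the new table under the write lock, using the new size
 * and the fresh seed, so readers see either the old consistent table or
 * the new one, never a mixture.
 */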

module_param_call(hashsize, set_hashsize, param_get_uint,
                  &ip_conntrack_htable_size, 0600);

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (!ip_conntrack_htable_size) {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
                                            &ip_conntrack_vmalloc);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED lock here, but good form anyway. */
        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        write_unlock_bh(&ip_conntrack_lock);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
                            ip_conntrack_htable_size);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}