net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/xfrm.h>
  73
  74 #include <linux/inet.h>
  75 #include <linux/ipv6.h>
  76 #include <linux/stddef.h>
  77 #include <linux/proc_fs.h>
  78 #include <linux/seq_file.h>
  79
/* Tunable consulted in __tcp_v4_check_established(): when non-zero, a
 * matching TIME-WAIT entry whose last timestamp is more than one second
 * old may be taken over by a new outgoing connection.
 */
int sysctl_tcp_tw_reuse;
/* Tunable; not referenced in this part of the file. */
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs, and for the ACKs that are sent outside
 * of any socket context (see tcp_v4_send_ack()).
 */
static struct socket *tcp_socket;

/* Forward declaration; the definition appears further down in this file. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

/* The TCP hash table set.  Only the listening-hash lock, its user count
 * and its wait queue are initialised statically here.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
  97
  98 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
  99 {
 100         return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 101 }
 102
 103 static void tcp_v4_hash(struct sock *sk)
 104 {
 105         inet_hash(&tcp_hashinfo, sk);
 106 }
 107
 108 void tcp_unhash(struct sock *sk)
 109 {
 110         inet_unhash(&tcp_hashinfo, sk);
 111 }
 112
 113 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 114 {
 115         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 116                                           skb->nh.iph->saddr,
 117                                           skb->h.th->dest,
 118                                           skb->h.th->source);
 119 }
 120
 121 /* called with local bh disabled */
 122 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 123                                       struct inet_timewait_sock **twp)
 124 {
 125         struct inet_sock *inet = inet_sk(sk);
 126         u32 daddr = inet->rcv_saddr;
 127         u32 saddr = inet->daddr;
 128         int dif = sk->sk_bound_dev_if;
 129         INET_ADDR_COOKIE(acookie, saddr, daddr)
 130         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
 131         unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
 132         struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
 133         struct sock *sk2;
 134         const struct hlist_node *node;
 135         struct inet_timewait_sock *tw;
 136
 137         prefetch(head->chain.first);
 138         write_lock(&head->lock);
 139
 140         /* Check TIME-WAIT sockets first. */
 141         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
 142                 tw = inet_twsk(sk2);
 143
 144                 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
 145                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 146                         struct tcp_sock *tp = tcp_sk(sk);
 147
 148                         /* With PAWS, it is safe from the viewpoint
 149                            of data integrity. Even without PAWS it
 150                            is safe provided sequence spaces do not
 151                            overlap i.e. at data rates <= 80Mbit/sec.
 152
 153                            Actually, the idea is close to VJ's one,
 154                            only timestamp cache is held not per host,
 155                            but per port pair and TW bucket is used
 156                            as state holder.
 157
 158                            If TW bucket has been already destroyed we
 159                            fall back to VJ's scheme and use initial
 160                            timestamp retrieved from peer table.
 161                          */
 162                         if (tcptw->tw_ts_recent_stamp &&
 163                             (!twp || (sysctl_tcp_tw_reuse &&
 164                                       xtime.tv_sec -
 165                                       tcptw->tw_ts_recent_stamp > 1))) {
 166                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 167                                 if (tp->write_seq == 0)
 168                                         tp->write_seq = 1;
 169                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 170                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 171                                 sock_hold(sk2);
 172                                 goto unique;
 173                         } else
 174                                 goto not_unique;
 175                 }
 176         }
 177         tw = NULL;
 178
 179         /* And established part... */
 180         sk_for_each(sk2, node, &head->chain) {
 181                 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
 182                         goto not_unique;
 183         }
 184
 185 unique:
 186         /* Must record num and sport now. Otherwise we will see
 187          * in hash table socket with a funny identity. */
 188         inet->num = lport;
 189         inet->sport = htons(lport);
 190         sk->sk_hash = hash;
 191         BUG_TRAP(sk_unhashed(sk));
 192         __sk_add_node(sk, &head->chain);
 193         sock_prot_inc_use(sk->sk_prot);
 194         write_unlock(&head->lock);
 195
 196         if (twp) {
 197                 *twp = tw;
 198                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 199         } else if (tw) {
 200                 /* Silly. Should hash-dance instead... */
 201                 inet_twsk_deschedule(tw, &tcp_death_row);
 202                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 203
 204                 inet_twsk_put(tw);
 205         }
 206
 207         return 0;
 208
 209 not_unique:
 210         write_unlock(&head->lock);
 211         return -EADDRNOTAVAIL;
 212 }
 213
 214 static inline u32 connect_port_offset(const struct sock *sk)
 215 {
 216         const struct inet_sock *inet = inet_sk(sk);
 217
 218         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 219                                          inet->dport);
 220 }
 221
 222 /*
 223  * Bind a port for a connect operation and hash it.
 224  */
 225 static inline int tcp_v4_hash_connect(struct sock *sk)
 226 {
 227         const unsigned short snum = inet_sk(sk)->num;
 228         struct inet_bind_hashbucket *head;
 229         struct inet_bind_bucket *tb;
 230         int ret;
 231
 232         if (!snum) {
 233                 int low = sysctl_local_port_range[0];
 234                 int high = sysctl_local_port_range[1];
 235                 int range = high - low;
 236                 int i;
 237                 int port;
 238                 static u32 hint;
 239                 u32 offset = hint + connect_port_offset(sk);
 240                 struct hlist_node *node;
 241                 struct inet_timewait_sock *tw = NULL;
 242
 243                 local_bh_disable();
 244                 for (i = 1; i <= range; i++) {
 245                         port = low + (i + offset) % range;
 246                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
 247                         spin_lock(&head->lock);
 248
 249                         /* Does not bother with rcv_saddr checks,
 250                          * because the established check is already
 251                          * unique enough.
 252                          */
 253                         inet_bind_bucket_for_each(tb, node, &head->chain) {
 254                                 if (tb->port == port) {
 255                                         BUG_TRAP(!hlist_empty(&tb->owners));
 256                                         if (tb->fastreuse >= 0)
 257                                                 goto next_port;
 258                                         if (!__tcp_v4_check_established(sk,
 259                                                                         port,
 260                                                                         &tw))
 261                                                 goto ok;
 262                                         goto next_port;
 263                                 }
 264                         }
 265
 266                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
 267                         if (!tb) {
 268                                 spin_unlock(&head->lock);
 269                                 break;
 270                         }
 271                         tb->fastreuse = -1;
 272                         goto ok;
 273
 274                 next_port:
 275                         spin_unlock(&head->lock);
 276                 }
 277                 local_bh_enable();
 278
 279                 return -EADDRNOTAVAIL;
 280
 281 ok:
 282                 hint += i;
 283
 284                 /* Head lock still held and bh's disabled */
 285                 inet_bind_hash(sk, tb, port);
 286                 if (sk_unhashed(sk)) {
 287                         inet_sk(sk)->sport = htons(port);
 288                         __inet_hash(&tcp_hashinfo, sk, 0);
 289                 }
 290                 spin_unlock(&head->lock);
 291
 292                 if (tw) {
 293                         inet_twsk_deschedule(tw, &tcp_death_row);;
 294                         inet_twsk_put(tw);
 295                 }
 296
 297                 ret = 0;
 298                 goto out;
 299         }
 300
 301         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
 302         tb  = inet_csk(sk)->icsk_bind_hash;
 303         spin_lock_bh(&head->lock);
 304         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 305                 __inet_hash(&tcp_hashinfo, sk, 0);
 306                 spin_unlock_bh(&head->lock);
 307                 return 0;
 308         } else {
 309                 spin_unlock(&head->lock);
 310                 /* No definite answer... Walk to established hash table */
 311                 ret = __tcp_v4_check_established(sk, snum, NULL);
 312 out:
 313                 local_bh_enable();
 314                 return ret;
 315         }
 316 }
 317
 318 /* This will initiate an outgoing connection. */
 319 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 320 {
 321         struct inet_sock *inet = inet_sk(sk);
 322         struct tcp_sock *tp = tcp_sk(sk);
 323         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 324         struct rtable *rt;
 325         u32 daddr, nexthop;
 326         int tmp;
 327         int err;
 328
 329         if (addr_len < sizeof(struct sockaddr_in))
 330                 return -EINVAL;
 331
 332         if (usin->sin_family != AF_INET)
 333                 return -EAFNOSUPPORT;
 334
 335         nexthop = daddr = usin->sin_addr.s_addr;
 336         if (inet->opt && inet->opt->srr) {
 337                 if (!daddr)
 338                         return -EINVAL;
 339                 nexthop = inet->opt->faddr;
 340         }
 341
 342         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 343                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 344                                IPPROTO_TCP,
 345                                inet->sport, usin->sin_port, sk);
 346         if (tmp < 0)
 347                 return tmp;
 348
 349         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 350                 ip_rt_put(rt);
 351                 return -ENETUNREACH;
 352         }
 353
 354         if (!inet->opt || !inet->opt->srr)
 355                 daddr = rt->rt_dst;
 356
 357         if (!inet->saddr)
 358                 inet->saddr = rt->rt_src;
 359         inet->rcv_saddr = inet->saddr;
 360
 361         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 362                 /* Reset inherited state */
 363                 tp->rx_opt.ts_recent       = 0;
 364                 tp->rx_opt.ts_recent_stamp = 0;
 365                 tp->write_seq              = 0;
 366         }
 367
 368         if (tcp_death_row.sysctl_tw_recycle &&
 369             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 370                 struct inet_peer *peer = rt_get_peer(rt);
 371
 372                 /* VJ's idea. We save last timestamp seen from
 373                  * the destination in peer table, when entering state TIME-WAIT
 374                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 375                  */
 376
 377                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 378                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 379                         tp->rx_opt.ts_recent = peer->tcp_ts;
 380                 }
 381         }
 382
 383         inet->dport = usin->sin_port;
 384         inet->daddr = daddr;
 385
 386         tp->ext_header_len = 0;
 387         if (inet->opt)
 388                 tp->ext_header_len = inet->opt->optlen;
 389
 390         tp->rx_opt.mss_clamp = 536;
 391
 392         /* Socket identity is still unknown (sport may be zero).
 393          * However we set state to SYN-SENT and not releasing socket
 394          * lock select source port, enter ourselves into the hash tables and
 395          * complete initialization after this.
 396          */
 397         tcp_set_state(sk, TCP_SYN_SENT);
 398         err = tcp_v4_hash_connect(sk);
 399         if (err)
 400                 goto failure;
 401
 402         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 403         if (err)
 404                 goto failure;
 405
 406         /* OK, now commit destination to socket.  */
 407         sk_setup_caps(sk, &rt->u.dst);
 408
 409         if (!tp->write_seq)
 410                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 411                                                            inet->daddr,
 412                                                            inet->sport,
 413                                                            usin->sin_port);
 414
 415         inet->id = tp->write_seq ^ jiffies;
 416
 417         err = tcp_connect(sk);
 418         rt = NULL;
 419         if (err)
 420                 goto failure;
 421
 422         return 0;
 423
 424 failure:
 425         /* This unhashes the socket and releases the local port, if necessary. */
 426         tcp_set_state(sk, TCP_CLOSE);
 427         ip_rt_put(rt);
 428         sk->sk_route_caps = 0;
 429         inet->dport = 0;
 430         return err;
 431 }
 432
 433 /*
 434  * This routine does path mtu discovery as defined in RFC1191.
 435  */
 436 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 437                                      u32 mtu)
 438 {
 439         struct dst_entry *dst;
 440         struct inet_sock *inet = inet_sk(sk);
 441         struct tcp_sock *tp = tcp_sk(sk);
 442
 443         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 444          * send out by Linux are always <576bytes so they should go through
 445          * unfragmented).
 446          */
 447         if (sk->sk_state == TCP_LISTEN)
 448                 return;
 449
 450         /* We don't check in the destentry if pmtu discovery is forbidden
 451          * on this route. We just assume that no packet_to_big packets
 452          * are send back when pmtu discovery is not active.
 453          * There is a small race when the user changes this flag in the
 454          * route, but I think that's acceptable.
 455          */
 456         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 457                 return;
 458
 459         dst->ops->update_pmtu(dst, mtu);
 460
 461         /* Something is about to be wrong... Remember soft error
 462          * for the case, if this connection will not able to recover.
 463          */
 464         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 465                 sk->sk_err_soft = EMSGSIZE;
 466
 467         mtu = dst_mtu(dst);
 468
 469         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 470             tp->pmtu_cookie > mtu) {
 471                 tcp_sync_mss(sk, mtu);
 472
 473                 /* Resend the TCP packet because it's
 474                  * clear that the old packet has been
 475                  * dropped. This is the new "fast" path mtu
 476                  * discovery.
 477                  */
 478                 tcp_simple_retransmit(sk);
 479         } /* else let the usual retransmit timer handle it */
 480 }
 481
 482 /*
 483  * This routine is called by the ICMP module when it gets some
 484  * sort of error condition.  If err < 0 then the socket should
 485  * be closed and the error returned to the user.  If err > 0
 486  * it's just the icmp type << 8 | icmp code.  After adjustment
 487  * header points to the first 8 bytes of the tcp header.  We need
 488  * to find the appropriate port.
 489  *
 490  * The locking strategy used here is very "optimistic". When
 491  * someone else accesses the socket the ICMP is just dropped
 492  * and for some paths there is no check at all.
 493  * A more general error queue to queue errors for later handling
 494  * is probably better.
 495  *
 496  */
 497
 498 void tcp_v4_err(struct sk_buff *skb, u32 info)
 499 {
 500         struct iphdr *iph = (struct iphdr *)skb->data;
 501         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 502         struct tcp_sock *tp;
 503         struct inet_sock *inet;
 504         int type = skb->h.icmph->type;
 505         int code = skb->h.icmph->code;
 506         struct sock *sk;
 507         __u32 seq;
 508         int err;
 509
 510         if (skb->len < (iph->ihl << 2) + 8) {
 511                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 512                 return;
 513         }
 514
 515         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
 516                          th->source, inet_iif(skb));
 517         if (!sk) {
 518                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 519                 return;
 520         }
 521         if (sk->sk_state == TCP_TIME_WAIT) {
 522                 inet_twsk_put((struct inet_timewait_sock *)sk);
 523                 return;
 524         }
 525
 526         bh_lock_sock(sk);
 527         /* If too many ICMPs get dropped on busy
 528          * servers this needs to be solved differently.
 529          */
 530         if (sock_owned_by_user(sk))
 531                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 532
 533         if (sk->sk_state == TCP_CLOSE)
 534                 goto out;
 535
 536         tp = tcp_sk(sk);
 537         seq = ntohl(th->seq);
 538         if (sk->sk_state != TCP_LISTEN &&
 539             !between(seq, tp->snd_una, tp->snd_nxt)) {
 540                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
 541                 goto out;
 542         }
 543
 544         switch (type) {
 545         case ICMP_SOURCE_QUENCH:
 546                 /* Just silently ignore these. */
 547                 goto out;
 548         case ICMP_PARAMETERPROB:
 549                 err = EPROTO;
 550                 break;
 551         case ICMP_DEST_UNREACH:
 552                 if (code > NR_ICMP_UNREACH)
 553                         goto out;
 554
 555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 556                         if (!sock_owned_by_user(sk))
 557                                 do_pmtu_discovery(sk, iph, info);
 558                         goto out;
 559                 }
 560
 561                 err = icmp_err_convert[code].errno;
 562                 break;
 563         case ICMP_TIME_EXCEEDED:
 564                 err = EHOSTUNREACH;
 565                 break;
 566         default:
 567                 goto out;
 568         }
 569
 570         switch (sk->sk_state) {
 571                 struct request_sock *req, **prev;
 572         case TCP_LISTEN:
 573                 if (sock_owned_by_user(sk))
 574                         goto out;
 575
 576                 req = inet_csk_search_req(sk, &prev, th->dest,
 577                                           iph->daddr, iph->saddr);
 578                 if (!req)
 579                         goto out;
 580
 581                 /* ICMPs are not backlogged, hence we cannot get
 582                    an established socket here.
 583                  */
 584                 BUG_TRAP(!req->sk);
 585
 586                 if (seq != tcp_rsk(req)->snt_isn) {
 587                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 588                         goto out;
 589                 }
 590
 591                 /*
 592                  * Still in SYN_RECV, just remove it silently.
 593                  * There is no good way to pass the error to the newly
 594                  * created socket, and POSIX does not want network
 595                  * errors returned from accept().
 596                  */
 597                 inet_csk_reqsk_queue_drop(sk, req, prev);
 598                 goto out;
 599
 600         case TCP_SYN_SENT:
 601         case TCP_SYN_RECV:  /* Cannot happen.
 602                                It can f.e. if SYNs crossed.
 603                              */
 604                 if (!sock_owned_by_user(sk)) {
 605                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 606                         sk->sk_err = err;
 607
 608                         sk->sk_error_report(sk);
 609
 610                         tcp_done(sk);
 611                 } else {
 612                         sk->sk_err_soft = err;
 613                 }
 614                 goto out;
 615         }
 616
 617         /* If we've already connected we will keep trying
 618          * until we time out, or the user gives up.
 619          *
 620          * rfc1122 4.2.3.9 allows to consider as hard errors
 621          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 622          * but it is obsoleted by pmtu discovery).
 623          *
 624          * Note, that in modern internet, where routing is unreliable
 625          * and in each dark corner broken firewalls sit, sending random
 626          * errors ordered by their masters even this two messages finally lose
 627          * their original sense (even Linux sends invalid PORT_UNREACHs)
 628          *
 629          * Now we are in compliance with RFCs.
 630          *                                                      --ANK (980905)
 631          */
 632
 633         inet = inet_sk(sk);
 634         if (!sock_owned_by_user(sk) && inet->recverr) {
 635                 sk->sk_err = err;
 636                 sk->sk_error_report(sk);
 637         } else  { /* Only an error on timeout */
 638                 sk->sk_err_soft = err;
 639         }
 640
 641 out:
 642         bh_unlock_sock(sk);
 643         sock_put(sk);
 644 }
 645
 646 /* This routine computes an IPv4 TCP checksum. */
 647 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 648                        struct sk_buff *skb)
 649 {
 650         struct inet_sock *inet = inet_sk(sk);
 651
 652         if (skb->ip_summed == CHECKSUM_HW) {
 653                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
 654                 skb->csum = offsetof(struct tcphdr, check);
 655         } else {
 656                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
 657                                          csum_partial((char *)th,
 658                                                       th->doff << 2,
 659                                                       skb->csum));
 660         }
 661 }
 662
 663 /*
 664  *      This routine will send an RST to the other tcp.
 665  *
 666  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 667  *                    for reset.
 668  *      Answer: if a packet caused RST, it is not for a socket
 669  *              existing in our system, if it is matched to a socket,
 670  *              it is just duplicate segment or bug in other side's TCP.
 671  *              So that we build reply only basing on parameters
 672  *              arrived with segment.
 673  *      Exception: precedence violation. We do not implement it in any case.
 674  */
 675
 676 static void tcp_v4_send_reset(struct sk_buff *skb)
 677 {
 678         struct tcphdr *th = skb->h.th;
 679         struct tcphdr rth;
 680         struct ip_reply_arg arg;
 681
 682         /* Never send a reset in response to a reset. */
 683         if (th->rst)
 684                 return;
 685
 686         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
 687                 return;
 688
 689         /* Swap the send and the receive. */
 690         memset(&rth, 0, sizeof(struct tcphdr));
 691         rth.dest   = th->source;
 692         rth.source = th->dest;
 693         rth.doff   = sizeof(struct tcphdr) / 4;
 694         rth.rst    = 1;
 695
 696         if (th->ack) {
 697                 rth.seq = th->ack_seq;
 698         } else {
 699                 rth.ack = 1;
 700                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 701                                     skb->len - (th->doff << 2));
 702         }
 703
 704         memset(&arg, 0, sizeof arg);
 705         arg.iov[0].iov_base = (unsigned char *)&rth;
 706         arg.iov[0].iov_len  = sizeof rth;
 707         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 708                                       skb->nh.iph->saddr, /*XXX*/
 709                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 710         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 711
 712         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 713
 714         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 715         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 716 }
 717
 718 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 719    outside socket context is ugly, certainly. What can I do?
 720  */
 721
 722 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 723                             u32 win, u32 ts)
 724 {
 725         struct tcphdr *th = skb->h.th;
 726         struct {
 727                 struct tcphdr th;
 728                 u32 tsopt[3];
 729         } rep;
 730         struct ip_reply_arg arg;
 731
 732         memset(&rep.th, 0, sizeof(struct tcphdr));
 733         memset(&arg, 0, sizeof arg);
 734
 735         arg.iov[0].iov_base = (unsigned char *)&rep;
 736         arg.iov[0].iov_len  = sizeof(rep.th);
 737         if (ts) {
 738                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 739                                      (TCPOPT_TIMESTAMP << 8) |
 740                                      TCPOLEN_TIMESTAMP);
 741                 rep.tsopt[1] = htonl(tcp_time_stamp);
 742                 rep.tsopt[2] = htonl(ts);
 743                 arg.iov[0].iov_len = sizeof(rep);
 744         }
 745
 746         /* Swap the send and the receive. */
 747         rep.th.dest    = th->source;
 748         rep.th.source  = th->dest;
 749         rep.th.doff    = arg.iov[0].iov_len / 4;
 750         rep.th.seq     = htonl(seq);
 751         rep.th.ack_seq = htonl(ack);
 752         rep.th.ack     = 1;
 753         rep.th.window  = htons(win);
 754
 755         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 756                                       skb->nh.iph->saddr, /*XXX*/
 757                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 758         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 759
 760         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 761
 762         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 763 }
 764
 765 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 766 {
 767         struct inet_timewait_sock *tw = inet_twsk(sk);
 768         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 769
 770         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 771                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 772
 773         inet_twsk_put(tw);
 774 }
 775
 776 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 777 {
 778         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 779                         req->ts_recent);
 780 }
 781
 782 /*
 783  *      Send a SYN-ACK after having received an ACK.
 784  *      This still operates on a request_sock only, not on a big
 785  *      socket.
 786  */
 787 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 788                               struct dst_entry *dst)
 789 {
 790         const struct inet_request_sock *ireq = inet_rsk(req);
 791         int err = -1;
 792         struct sk_buff * skb;
 793
 794         /* First, grab a route. */
 795         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 796                 goto out;
 797
 798         skb = tcp_make_synack(sk, dst, req);
 799
 800         if (skb) {
 801                 struct tcphdr *th = skb->h.th;
 802
 803                 th->check = tcp_v4_check(th, skb->len,
 804                                          ireq->loc_addr,
 805                                          ireq->rmt_addr,
 806                                          csum_partial((char *)th, skb->len,
 807                                                       skb->csum));
 808
 809                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 810                                             ireq->rmt_addr,
 811                                             ireq->opt);
 812                 if (err == NET_XMIT_CN)
 813                         err = 0;
 814         }
 815
 816 out:
 817         dst_release(dst);
 818         return err;
 819 }
 820
 821 /*
 822  *      IPv4 request_sock destructor.
 823  */
 824 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 825 {
 826         kfree(inet_rsk(req)->opt);
 827 }
 828
 829 static inline void syn_flood_warning(struct sk_buff *skb)
 830 {
 831         static unsigned long warntime;
 832
 833         if (time_after(jiffies, (warntime + HZ * 60))) {
 834                 warntime = jiffies;
 835                 printk(KERN_INFO
 836                        "possible SYN flooding on port %d. Sending cookies.\n",
 837                        ntohs(skb->h.th->dest));
 838         }
 839 }
 840
 841 /*
 842  * Save and compile IPv4 options into the request_sock if needed.
 843  */
 844 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 845                                                      struct sk_buff *skb)
 846 {
 847         struct ip_options *opt = &(IPCB(skb)->opt);
 848         struct ip_options *dopt = NULL;
 849
 850         if (opt && opt->optlen) {
 851                 int opt_size = optlength(opt);
 852                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 853                 if (dopt) {
 854                         if (ip_options_echo(dopt, skb)) {
 855                                 kfree(dopt);
 856                                 dopt = NULL;
 857                         }
 858                 }
 859         }
 860         return dopt;
 861 }
 862
 863 struct request_sock_ops tcp_request_sock_ops = {
 864         .family         =       PF_INET,
 865         .obj_size       =       sizeof(struct tcp_request_sock),
 866         .rtx_syn_ack    =       tcp_v4_send_synack,
 867         .send_ack       =       tcp_v4_reqsk_send_ack,
 868         .destructor     =       tcp_v4_reqsk_destructor,
 869         .send_reset     =       tcp_v4_send_reset,
 870 };
 871
 872 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 873 {
 874         struct inet_request_sock *ireq;
 875         struct tcp_options_received tmp_opt;
 876         struct request_sock *req;
 877         __u32 saddr = skb->nh.iph->saddr;
 878         __u32 daddr = skb->nh.iph->daddr;
 879         __u32 isn = TCP_SKB_CB(skb)->when;
 880         struct dst_entry *dst = NULL;
 881 #ifdef CONFIG_SYN_COOKIES
 882         int want_cookie = 0;
 883 #else
 884 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
 885 #endif
 886
 887         /* Never answer to SYNs send to broadcast or multicast */
 888         if (((struct rtable *)skb->dst)->rt_flags &
 889             (RTCF_BROADCAST | RTCF_MULTICAST))
 890                 goto drop;
 891
 892         /* TW buckets are converted to open requests without
 893          * limitations, they conserve resources and peer is
 894          * evidently real one.
 895          */
 896         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 897 #ifdef CONFIG_SYN_COOKIES
 898                 if (sysctl_tcp_syncookies) {
 899                         want_cookie = 1;
 900                 } else
 901 #endif
 902                 goto drop;
 903         }
 904
 905         /* Accept backlog is full. If we have already queued enough
 906          * of warm entries in syn queue, drop request. It is better than
 907          * clogging syn queue with openreqs with exponentially increasing
 908          * timeout.
 909          */
 910         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 911                 goto drop;
 912
 913         req = reqsk_alloc(&tcp_request_sock_ops);
 914         if (!req)
 915                 goto drop;
 916
 917         tcp_clear_options(&tmp_opt);
 918         tmp_opt.mss_clamp = 536;
 919         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 920
 921         tcp_parse_options(skb, &tmp_opt, 0);
 922
 923         if (want_cookie) {
 924                 tcp_clear_options(&tmp_opt);
 925                 tmp_opt.saw_tstamp = 0;
 926         }
 927
 928         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 929                 /* Some OSes (unknown ones, but I see them on web server, which
 930                  * contains information interesting only for windows'
 931                  * users) do not send their stamp in SYN. It is easy case.
 932                  * We simply do not advertise TS support.
 933                  */
 934                 tmp_opt.saw_tstamp = 0;
 935                 tmp_opt.tstamp_ok  = 0;
 936         }
 937         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 938
 939         tcp_openreq_init(req, &tmp_opt, skb);
 940
 941         ireq = inet_rsk(req);
 942         ireq->loc_addr = daddr;
 943         ireq->rmt_addr = saddr;
 944         ireq->opt = tcp_v4_save_options(sk, skb);
 945         if (!want_cookie)
 946                 TCP_ECN_create_request(req, skb->h.th);
 947
 948         if (want_cookie) {
 949 #ifdef CONFIG_SYN_COOKIES
 950                 syn_flood_warning(skb);
 951 #endif
 952                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 953         } else if (!isn) {
 954                 struct inet_peer *peer = NULL;
 955
 956                 /* VJ's idea. We save last timestamp seen
 957                  * from the destination in peer table, when entering
 958                  * state TIME-WAIT, and check against it before
 959                  * accepting new connection request.
 960                  *
 961                  * If "isn" is not zero, this request hit alive
 962                  * timewait bucket, so that all the necessary checks
 963                  * are made in the function processing timewait state.
 964                  */
 965                 if (tmp_opt.saw_tstamp &&
 966                     tcp_death_row.sysctl_tw_recycle &&
 967                     (dst = inet_csk_route_req(sk, req)) != NULL &&
 968                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 969                     peer->v4daddr == saddr) {
 970                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
 971                             (s32)(peer->tcp_ts - req->ts_recent) >
 972                                                         TCP_PAWS_WINDOW) {
 973                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
 974                                 dst_release(dst);
 975                                 goto drop_and_free;
 976                         }
 977                 }
 978                 /* Kill the following clause, if you dislike this way. */
 979                 else if (!sysctl_tcp_syncookies &&
 980                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 981                           (sysctl_max_syn_backlog >> 2)) &&
 982                          (!peer || !peer->tcp_ts_stamp) &&
 983                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 984                         /* Without syncookies last quarter of
 985                          * backlog is filled with destinations,
 986                          * proven to be alive.
 987                          * It means that we continue to communicate
 988                          * to destinations, already remembered
 989                          * to the moment of synflood.
 990                          */
 991                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
 992                                        "request from %u.%u.%u.%u/%u\n",
 993                                        NIPQUAD(saddr),
 994                                        ntohs(skb->h.th->source));
 995                         dst_release(dst);
 996                         goto drop_and_free;
 997                 }
 998
 999                 isn = tcp_v4_init_sequence(sk, skb);
1000         }
1001         tcp_rsk(req)->snt_isn = isn;
1002
1003         if (tcp_v4_send_synack(sk, req, dst))
1004                 goto drop_and_free;
1005
1006         if (want_cookie) {
1007                 reqsk_free(req);
1008         } else {
1009                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1010         }
1011         return 0;
1012
1013 drop_and_free:
1014         reqsk_free(req);
1015 drop:
1016         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1017         return 0;
1018 }
1019
1020
1021 /*
1022  * The three way handshake has completed - we got a valid synack -
1023  * now create the new socket.
1024  */
1025 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1026                                   struct request_sock *req,
1027                                   struct dst_entry *dst)
1028 {
1029         struct inet_request_sock *ireq;
1030         struct inet_sock *newinet;
1031         struct tcp_sock *newtp;
1032         struct sock *newsk;
1033
1034         if (sk_acceptq_is_full(sk))
1035                 goto exit_overflow;
1036
1037         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1038                 goto exit;
1039
1040         newsk = tcp_create_openreq_child(sk, req, skb);
1041         if (!newsk)
1042                 goto exit;
1043
1044         sk_setup_caps(newsk, dst);
1045
1046         newtp                 = tcp_sk(newsk);
1047         newinet               = inet_sk(newsk);
1048         ireq                  = inet_rsk(req);
1049         newinet->daddr        = ireq->rmt_addr;
1050         newinet->rcv_saddr    = ireq->loc_addr;
1051         newinet->saddr        = ireq->loc_addr;
1052         newinet->opt          = ireq->opt;
1053         ireq->opt             = NULL;
1054         newinet->mc_index     = inet_iif(skb);
1055         newinet->mc_ttl       = skb->nh.iph->ttl;
1056         newtp->ext_header_len = 0;
1057         if (newinet->opt)
1058                 newtp->ext_header_len = newinet->opt->optlen;
1059         newinet->id = newtp->write_seq ^ jiffies;
1060
1061         tcp_sync_mss(newsk, dst_mtu(dst));
1062         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1063         tcp_initialize_rcv_mss(newsk);
1064
1065         __inet_hash(&tcp_hashinfo, newsk, 0);
1066         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1067
1068         return newsk;
1069
1070 exit_overflow:
1071         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1072 exit:
1073         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1074         dst_release(dst);
1075         return NULL;
1076 }
1077
1078 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1079 {
1080         struct tcphdr *th = skb->h.th;
1081         struct iphdr *iph = skb->nh.iph;
1082         struct sock *nsk;
1083         struct request_sock **prev;
1084         /* Find possible connection requests. */
1085         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1086                                                        iph->saddr, iph->daddr);
1087         if (req)
1088                 return tcp_check_req(sk, skb, req, prev);
1089
1090         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1091                                         th->source, skb->nh.iph->daddr,
1092                                         ntohs(th->dest), inet_iif(skb));
1093
1094         if (nsk) {
1095                 if (nsk->sk_state != TCP_TIME_WAIT) {
1096                         bh_lock_sock(nsk);
1097                         return nsk;
1098                 }
1099                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1100                 return NULL;
1101         }
1102
1103 #ifdef CONFIG_SYN_COOKIES
1104         if (!th->rst && !th->syn && th->ack)
1105                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1106 #endif
1107         return sk;
1108 }
1109
1110 static int tcp_v4_checksum_init(struct sk_buff *skb)
1111 {
1112         if (skb->ip_summed == CHECKSUM_HW) {
1113                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1114                                   skb->nh.iph->daddr, skb->csum)) {
1115                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1116                         return 0;
1117                 }
1118         }
1119
1120         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1121                                        skb->len, IPPROTO_TCP, 0);
1122
1123         if (skb->len <= 76) {
1124                 return __skb_checksum_complete(skb);
1125         }
1126         return 0;
1127 }
1128
1129
1130 /* The socket must have it's spinlock held when we get
1131  * here.
1132  *
1133  * We have a potential double-lock case here, so even when
1134  * doing backlog processing we use the BH locking scheme.
1135  * This is because we cannot sleep with the original spinlock
1136  * held.
1137  */
1138 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1139 {
1140         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1141                 TCP_CHECK_TIMER(sk);
1142                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1143                         goto reset;
1144                 TCP_CHECK_TIMER(sk);
1145                 return 0;
1146         }
1147
1148         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1149                 goto csum_err;
1150
1151         if (sk->sk_state == TCP_LISTEN) {
1152                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1153                 if (!nsk)
1154                         goto discard;
1155
1156                 if (nsk != sk) {
1157                         if (tcp_child_process(sk, nsk, skb))
1158                                 goto reset;
1159                         return 0;
1160                 }
1161         }
1162
1163         TCP_CHECK_TIMER(sk);
1164         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1165                 goto reset;
1166         TCP_CHECK_TIMER(sk);
1167         return 0;
1168
1169 reset:
1170         tcp_v4_send_reset(skb);
1171 discard:
1172         kfree_skb(skb);
1173         /* Be careful here. If this function gets more complicated and
1174          * gcc suffers from register pressure on the x86, sk (in %ebx)
1175          * might be destroyed here. This current version compiles correctly,
1176          * but you have been warned.
1177          */
1178         return 0;
1179
1180 csum_err:
1181         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1182         goto discard;
1183 }
1184
1185 /*
1186  *      From tcp_input.c
1187  */
1188
1189 int tcp_v4_rcv(struct sk_buff *skb)
1190 {
1191         struct tcphdr *th;
1192         struct sock *sk;
1193         int ret;
1194
1195         if (skb->pkt_type != PACKET_HOST)
1196                 goto discard_it;
1197
1198         /* Count it even if it's bad */
1199         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1200
1201         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1202                 goto discard_it;
1203
1204         th = skb->h.th;
1205
1206         if (th->doff < sizeof(struct tcphdr) / 4)
1207                 goto bad_packet;
1208         if (!pskb_may_pull(skb, th->doff * 4))
1209                 goto discard_it;
1210
1211         /* An explanation is required here, I think.
1212          * Packet length and doff are validated by header prediction,
1213          * provided case of th->doff==0 is eliminated.
1214          * So, we defer the checks. */
1215         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1216              tcp_v4_checksum_init(skb)))
1217                 goto bad_packet;
1218
1219         th = skb->h.th;
1220         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1221         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1222                                     skb->len - th->doff * 4);
1223         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1224         TCP_SKB_CB(skb)->when    = 0;
1225         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1226         TCP_SKB_CB(skb)->sacked  = 0;
1227
1228         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1229                            skb->nh.iph->daddr, ntohs(th->dest),
1230                            inet_iif(skb));
1231
1232         if (!sk)
1233                 goto no_tcp_socket;
1234
1235 process:
1236         if (sk->sk_state == TCP_TIME_WAIT)
1237                 goto do_time_wait;
1238
1239         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1240                 goto discard_and_relse;
1241
1242         if (sk_filter(sk, skb, 0))
1243                 goto discard_and_relse;
1244
1245         skb->dev = NULL;
1246
1247         bh_lock_sock(sk);
1248         ret = 0;
1249         if (!sock_owned_by_user(sk)) {
1250                 if (!tcp_prequeue(sk, skb))
1251                         ret = tcp_v4_do_rcv(sk, skb);
1252         } else
1253                 sk_add_backlog(sk, skb);
1254         bh_unlock_sock(sk);
1255
1256         sock_put(sk);
1257
1258         return ret;
1259
1260 no_tcp_socket:
1261         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1262                 goto discard_it;
1263
1264         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1265 bad_packet:
1266                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1267         } else {
1268                 tcp_v4_send_reset(skb);
1269         }
1270
1271 discard_it:
1272         /* Discard frame. */
1273         kfree_skb(skb);
1274         return 0;
1275
1276 discard_and_relse:
1277         sock_put(sk);
1278         goto discard_it;
1279
1280 do_time_wait:
1281         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1282                 inet_twsk_put((struct inet_timewait_sock *) sk);
1283                 goto discard_it;
1284         }
1285
1286         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1287                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1288                 inet_twsk_put((struct inet_timewait_sock *) sk);
1289                 goto discard_it;
1290         }
1291         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1292                                            skb, th)) {
1293         case TCP_TW_SYN: {
1294                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1295                                                         skb->nh.iph->daddr,
1296                                                         ntohs(th->dest),
1297                                                         inet_iif(skb));
1298                 if (sk2) {
1299                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1300                                              &tcp_death_row);
1301                         inet_twsk_put((struct inet_timewait_sock *)sk);
1302                         sk = sk2;
1303                         goto process;
1304                 }
1305                 /* Fall through to ACK */
1306         }
1307         case TCP_TW_ACK:
1308                 tcp_v4_timewait_ack(sk, skb);
1309                 break;
1310         case TCP_TW_RST:
1311                 goto no_tcp_socket;
1312         case TCP_TW_SUCCESS:;
1313         }
1314         goto discard_it;
1315 }
1316
1317 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318 {
1319         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320         struct inet_sock *inet = inet_sk(sk);
1321
1322         sin->sin_family         = AF_INET;
1323         sin->sin_addr.s_addr    = inet->daddr;
1324         sin->sin_port           = inet->dport;
1325 }
1326
1327 /* VJ's idea. Save last timestamp seen from this destination
1328  * and hold it at least for normal timewait interval to use for duplicate
1329  * segment detection in subsequent connections, before they enter synchronized
1330  * state.
1331  */
1332
1333 int tcp_v4_remember_stamp(struct sock *sk)
1334 {
1335         struct inet_sock *inet = inet_sk(sk);
1336         struct tcp_sock *tp = tcp_sk(sk);
1337         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1338         struct inet_peer *peer = NULL;
1339         int release_it = 0;
1340
1341         if (!rt || rt->rt_dst != inet->daddr) {
1342                 peer = inet_getpeer(inet->daddr, 1);
1343                 release_it = 1;
1344         } else {
1345                 if (!rt->peer)
1346                         rt_bind_peer(rt, 1);
1347                 peer = rt->peer;
1348         }
1349
1350         if (peer) {
1351                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1352                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1353                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1354                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1355                         peer->tcp_ts = tp->rx_opt.ts_recent;
1356                 }
1357                 if (release_it)
1358                         inet_putpeer(peer);
1359                 return 1;
1360         }
1361
1362         return 0;
1363 }
1364
1365 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1366 {
1367         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1368
1369         if (peer) {
1370                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1371
1372                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1373                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1374                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1375                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1376                         peer->tcp_ts       = tcptw->tw_ts_recent;
1377                 }
1378                 inet_putpeer(peer);
1379                 return 1;
1380         }
1381
1382         return 0;
1383 }
1384
1385 struct tcp_func ipv4_specific = {
1386         .queue_xmit     =       ip_queue_xmit,
1387         .send_check     =       tcp_v4_send_check,
1388         .rebuild_header =       inet_sk_rebuild_header,
1389         .conn_request   =       tcp_v4_conn_request,
1390         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1391         .remember_stamp =       tcp_v4_remember_stamp,
1392         .net_header_len =       sizeof(struct iphdr),
1393         .setsockopt     =       ip_setsockopt,
1394         .getsockopt     =       ip_getsockopt,
1395         .addr2sockaddr  =       v4_addr2sockaddr,
1396         .sockaddr_len   =       sizeof(struct sockaddr_in),
1397 };
1398
1399 /* NOTE: A lot of things set to zero explicitly by call to
1400  *       sk_alloc() so need not be done here.
1401  */
1402 static int tcp_v4_init_sock(struct sock *sk)
1403 {
1404         struct inet_connection_sock *icsk = inet_csk(sk);
1405         struct tcp_sock *tp = tcp_sk(sk);
1406
1407         skb_queue_head_init(&tp->out_of_order_queue);
1408         tcp_init_xmit_timers(sk);
1409         tcp_prequeue_init(tp);
1410
1411         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1412         tp->mdev = TCP_TIMEOUT_INIT;
1413
1414         /* So many TCP implementations out there (incorrectly) count the
1415          * initial SYN frame in their delayed-ACK and congestion control
1416          * algorithms that we must have the following bandaid to talk
1417          * efficiently to them.  -DaveM
1418          */
1419         tp->snd_cwnd = 2;
1420
1421         /* See draft-stevens-tcpca-spec-01 for discussion of the
1422          * initialization of these values.
1423          */
1424         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1425         tp->snd_cwnd_clamp = ~0;
1426         tp->mss_cache = 536;
1427
1428         tp->reordering = sysctl_tcp_reordering;
1429         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1430
1431         sk->sk_state = TCP_CLOSE;
1432
1433         sk->sk_write_space = sk_stream_write_space;
1434         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435
1436         tp->af_specific = &ipv4_specific;
1437
1438         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1440
1441         atomic_inc(&tcp_sockets_allocated);
1442
1443         return 0;
1444 }
1445
1446 int tcp_v4_destroy_sock(struct sock *sk)
1447 {
1448         struct tcp_sock *tp = tcp_sk(sk);
1449
1450         tcp_clear_xmit_timers(sk);
1451
1452         tcp_cleanup_congestion_control(sk);
1453
1454         /* Cleanup up the write buffer. */
1455         sk_stream_writequeue_purge(sk);
1456
1457         /* Cleans up our, hopefully empty, out_of_order_queue. */
1458         __skb_queue_purge(&tp->out_of_order_queue);
1459
1460         /* Clean prequeue, it must be empty really */
1461         __skb_queue_purge(&tp->ucopy.prequeue);
1462
1463         /* Clean up a referenced TCP bind bucket. */
1464         if (inet_csk(sk)->icsk_bind_hash)
1465                 inet_put_port(&tcp_hashinfo, sk);
1466
1467         /*
1468          * If sendmsg cached page exists, toss it.
1469          */
1470         if (sk->sk_sndmsg_page) {
1471                 __free_page(sk->sk_sndmsg_page);
1472                 sk->sk_sndmsg_page = NULL;
1473         }
1474
1475         atomic_dec(&tcp_sockets_allocated);
1476
1477         return 0;
1478 }
1479
1480 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1481
1482 #ifdef CONFIG_PROC_FS
1483 /* Proc filesystem TCP sock list dumping. */
1484
1485 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1486 {
1487         return hlist_empty(head) ? NULL :
1488                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1489 }
1490
1491 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1492 {
1493         return tw->tw_node.next ?
1494                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1495 }
1496
1497 static void *listening_get_next(struct seq_file *seq, void *cur)
1498 {
1499         struct inet_connection_sock *icsk;
1500         struct hlist_node *node;
1501         struct sock *sk = cur;
1502         struct tcp_iter_state* st = seq->private;
1503
1504         if (!sk) {
1505                 st->bucket = 0;
1506                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1507                 goto get_sk;
1508         }
1509
1510         ++st->num;
1511
1512         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1513                 struct request_sock *req = cur;
1514
1515                 icsk = inet_csk(st->syn_wait_sk);
1516                 req = req->dl_next;
1517                 while (1) {
1518                         while (req) {
1519                                 if (req->rsk_ops->family == st->family) {
1520                                         cur = req;
1521                                         goto out;
1522                                 }
1523                                 req = req->dl_next;
1524                         }
1525                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1526                                 break;
1527 get_req:
1528                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1529                 }
1530                 sk        = sk_next(st->syn_wait_sk);
1531                 st->state = TCP_SEQ_STATE_LISTENING;
1532                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1533         } else {
1534                 icsk = inet_csk(sk);
1535                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1536                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1537                         goto start_req;
1538                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1539                 sk = sk_next(sk);
1540         }
1541 get_sk:
1542         sk_for_each_from(sk, node) {
1543                 if (sk->sk_family == st->family) {
1544                         cur = sk;
1545                         goto out;
1546                 }
1547                 icsk = inet_csk(sk);
1548                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1549                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1550 start_req:
1551                         st->uid         = sock_i_uid(sk);
1552                         st->syn_wait_sk = sk;
1553                         st->state       = TCP_SEQ_STATE_OPENREQ;
1554                         st->sbucket     = 0;
1555                         goto get_req;
1556                 }
1557                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1558         }
1559         if (++st->bucket < INET_LHTABLE_SIZE) {
1560                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1561                 goto get_sk;
1562         }
1563         cur = NULL;
1564 out:
1565         return cur;
1566 }
1567
1568 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1569 {
1570         void *rc = listening_get_next(seq, NULL);
1571
1572         while (rc && *pos) {
1573                 rc = listening_get_next(seq, rc);
1574                 --*pos;
1575         }
1576         return rc;
1577 }
1578
1579 static void *established_get_first(struct seq_file *seq)
1580 {
1581         struct tcp_iter_state* st = seq->private;
1582         void *rc = NULL;
1583
1584         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1585                 struct sock *sk;
1586                 struct hlist_node *node;
1587                 struct inet_timewait_sock *tw;
1588
1589                 /* We can reschedule _before_ having picked the target: */
1590                 cond_resched_softirq();
1591
1592                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1593                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1594                         if (sk->sk_family != st->family) {
1595                                 continue;
1596                         }
1597                         rc = sk;
1598                         goto out;
1599                 }
1600                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1601                 inet_twsk_for_each(tw, node,
1602                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1603                         if (tw->tw_family != st->family) {
1604                                 continue;
1605                         }
1606                         rc = tw;
1607                         goto out;
1608                 }
1609                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1610                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1611         }
1612 out:
1613         return rc;
1614 }
1615
1616 static void *established_get_next(struct seq_file *seq, void *cur)
1617 {
1618         struct sock *sk = cur;
1619         struct inet_timewait_sock *tw;
1620         struct hlist_node *node;
1621         struct tcp_iter_state* st = seq->private;
1622
1623         ++st->num;
1624
1625         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1626                 tw = cur;
1627                 tw = tw_next(tw);
1628 get_tw:
1629                 while (tw && tw->tw_family != st->family) {
1630                         tw = tw_next(tw);
1631                 }
1632                 if (tw) {
1633                         cur = tw;
1634                         goto out;
1635                 }
1636                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1637                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1638
1639                 /* We can reschedule between buckets: */
1640                 cond_resched_softirq();
1641
1642                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1643                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1644                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1645                 } else {
1646                         cur = NULL;
1647                         goto out;
1648                 }
1649         } else
1650                 sk = sk_next(sk);
1651
1652         sk_for_each_from(sk, node) {
1653                 if (sk->sk_family == st->family)
1654                         goto found;
1655         }
1656
1657         st->state = TCP_SEQ_STATE_TIME_WAIT;
1658         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1659         goto get_tw;
1660 found:
1661         cur = sk;
1662 out:
1663         return cur;
1664 }
1665
1666 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1667 {
1668         void *rc = established_get_first(seq);
1669
1670         while (rc && pos) {
1671                 rc = established_get_next(seq, rc);
1672                 --pos;
1673         }
1674         return rc;
1675 }
1676
1677 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1678 {
1679         void *rc;
1680         struct tcp_iter_state* st = seq->private;
1681
1682         inet_listen_lock(&tcp_hashinfo);
1683         st->state = TCP_SEQ_STATE_LISTENING;
1684         rc        = listening_get_idx(seq, &pos);
1685
1686         if (!rc) {
1687                 inet_listen_unlock(&tcp_hashinfo);
1688                 local_bh_disable();
1689                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1690                 rc        = established_get_idx(seq, pos);
1691         }
1692
1693         return rc;
1694 }
1695
1696 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1697 {
1698         struct tcp_iter_state* st = seq->private;
1699         st->state = TCP_SEQ_STATE_LISTENING;
1700         st->num = 0;
1701         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1702 }
1703
1704 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1705 {
1706         void *rc = NULL;
1707         struct tcp_iter_state* st;
1708
1709         if (v == SEQ_START_TOKEN) {
1710                 rc = tcp_get_idx(seq, 0);
1711                 goto out;
1712         }
1713         st = seq->private;
1714
1715         switch (st->state) {
1716         case TCP_SEQ_STATE_OPENREQ:
1717         case TCP_SEQ_STATE_LISTENING:
1718                 rc = listening_get_next(seq, v);
1719                 if (!rc) {
1720                         inet_listen_unlock(&tcp_hashinfo);
1721                         local_bh_disable();
1722                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1723                         rc        = established_get_first(seq);
1724                 }
1725                 break;
1726         case TCP_SEQ_STATE_ESTABLISHED:
1727         case TCP_SEQ_STATE_TIME_WAIT:
1728                 rc = established_get_next(seq, v);
1729                 break;
1730         }
1731 out:
1732         ++*pos;
1733         return rc;
1734 }
1735
1736 static void tcp_seq_stop(struct seq_file *seq, void *v)
1737 {
1738         struct tcp_iter_state* st = seq->private;
1739
1740         switch (st->state) {
1741         case TCP_SEQ_STATE_OPENREQ:
1742                 if (v) {
1743                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1744                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1745                 }
1746         case TCP_SEQ_STATE_LISTENING:
1747                 if (v != SEQ_START_TOKEN)
1748                         inet_listen_unlock(&tcp_hashinfo);
1749                 break;
1750         case TCP_SEQ_STATE_TIME_WAIT:
1751         case TCP_SEQ_STATE_ESTABLISHED:
1752                 if (v)
1753                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1754                 local_bh_enable();
1755                 break;
1756         }
1757 }
1758
1759 static int tcp_seq_open(struct inode *inode, struct file *file)
1760 {
1761         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1762         struct seq_file *seq;
1763         struct tcp_iter_state *s;
1764         int rc;
1765
1766         if (unlikely(afinfo == NULL))
1767                 return -EINVAL;
1768
1769         s = kmalloc(sizeof(*s), GFP_KERNEL);
1770         if (!s)
1771                 return -ENOMEM;
1772         memset(s, 0, sizeof(*s));
1773         s->family               = afinfo->family;
1774         s->seq_ops.start        = tcp_seq_start;
1775         s->seq_ops.next         = tcp_seq_next;
1776         s->seq_ops.show         = afinfo->seq_show;
1777         s->seq_ops.stop         = tcp_seq_stop;
1778
1779         rc = seq_open(file, &s->seq_ops);
1780         if (rc)
1781                 goto out_kfree;
1782         seq          = file->private_data;
1783         seq->private = s;
1784 out:
1785         return rc;
1786 out_kfree:
1787         kfree(s);
1788         goto out;
1789 }
1790
1791 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1792 {
1793         int rc = 0;
1794         struct proc_dir_entry *p;
1795
1796         if (!afinfo)
1797                 return -EINVAL;
1798         afinfo->seq_fops->owner         = afinfo->owner;
1799         afinfo->seq_fops->open          = tcp_seq_open;
1800         afinfo->seq_fops->read          = seq_read;
1801         afinfo->seq_fops->llseek        = seq_lseek;
1802         afinfo->seq_fops->release       = seq_release_private;
1803
1804         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1805         if (p)
1806                 p->data = afinfo;
1807         else
1808                 rc = -ENOMEM;
1809         return rc;
1810 }
1811
1812 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1813 {
1814         if (!afinfo)
1815                 return;
1816         proc_net_remove(afinfo->name);
1817         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1818 }
1819
1820 static void get_openreq4(struct sock *sk, struct request_sock *req,
1821                          char *tmpbuf, int i, int uid)
1822 {
1823         const struct inet_request_sock *ireq = inet_rsk(req);
1824         int ttd = req->expires - jiffies;
1825
1826         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1827                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1828                 i,
1829                 ireq->loc_addr,
1830                 ntohs(inet_sk(sk)->sport),
1831                 ireq->rmt_addr,
1832                 ntohs(ireq->rmt_port),
1833                 TCP_SYN_RECV,
1834                 0, 0, /* could print option size, but that is af dependent. */
1835                 1,    /* timers active (only the expire timer) */
1836                 jiffies_to_clock_t(ttd),
1837                 req->retrans,
1838                 uid,
1839                 0,  /* non standard timer */
1840                 0, /* open_requests have no inode */
1841                 atomic_read(&sk->sk_refcnt),
1842                 req);
1843 }
1844
1845 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1846 {
1847         int timer_active;
1848         unsigned long timer_expires;
1849         struct tcp_sock *tp = tcp_sk(sp);
1850         const struct inet_connection_sock *icsk = inet_csk(sp);
1851         struct inet_sock *inet = inet_sk(sp);
1852         unsigned int dest = inet->daddr;
1853         unsigned int src = inet->rcv_saddr;
1854         __u16 destp = ntohs(inet->dport);
1855         __u16 srcp = ntohs(inet->sport);
1856
1857         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1858                 timer_active    = 1;
1859                 timer_expires   = icsk->icsk_timeout;
1860         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1861                 timer_active    = 4;
1862                 timer_expires   = icsk->icsk_timeout;
1863         } else if (timer_pending(&sp->sk_timer)) {
1864                 timer_active    = 2;
1865                 timer_expires   = sp->sk_timer.expires;
1866         } else {
1867                 timer_active    = 0;
1868                 timer_expires = jiffies;
1869         }
1870
1871         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1872                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1873                 i, src, srcp, dest, destp, sp->sk_state,
1874                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1875                 timer_active,
1876                 jiffies_to_clock_t(timer_expires - jiffies),
1877                 icsk->icsk_retransmits,
1878                 sock_i_uid(sp),
1879                 icsk->icsk_probes_out,
1880                 sock_i_ino(sp),
1881                 atomic_read(&sp->sk_refcnt), sp,
1882                 icsk->icsk_rto,
1883                 icsk->icsk_ack.ato,
1884                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1885                 tp->snd_cwnd,
1886                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1887 }
1888
1889 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1890 {
1891         unsigned int dest, src;
1892         __u16 destp, srcp;
1893         int ttd = tw->tw_ttd - jiffies;
1894
1895         if (ttd < 0)
1896                 ttd = 0;
1897
1898         dest  = tw->tw_daddr;
1899         src   = tw->tw_rcv_saddr;
1900         destp = ntohs(tw->tw_dport);
1901         srcp  = ntohs(tw->tw_sport);
1902
1903         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1904                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1905                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1906                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1907                 atomic_read(&tw->tw_refcnt), tw);
1908 }
1909
1910 #define TMPSZ 150
1911
1912 static int tcp4_seq_show(struct seq_file *seq, void *v)
1913 {
1914         struct tcp_iter_state* st;
1915         char tmpbuf[TMPSZ + 1];
1916
1917         if (v == SEQ_START_TOKEN) {
1918                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1919                            "  sl  local_address rem_address   st tx_queue "
1920                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1921                            "inode");
1922                 goto out;
1923         }
1924         st = seq->private;
1925
1926         switch (st->state) {
1927         case TCP_SEQ_STATE_LISTENING:
1928         case TCP_SEQ_STATE_ESTABLISHED:
1929                 get_tcp4_sock(v, tmpbuf, st->num);
1930                 break;
1931         case TCP_SEQ_STATE_OPENREQ:
1932                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1933                 break;
1934         case TCP_SEQ_STATE_TIME_WAIT:
1935                 get_timewait4_sock(v, tmpbuf, st->num);
1936                 break;
1937         }
1938         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1939 out:
1940         return 0;
1941 }
1942
1943 static struct file_operations tcp4_seq_fops;
1944 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1945         .owner          = THIS_MODULE,
1946         .name           = "tcp",
1947         .family         = AF_INET,
1948         .seq_show       = tcp4_seq_show,
1949         .seq_fops       = &tcp4_seq_fops,
1950 };
1951
1952 int __init tcp4_proc_init(void)
1953 {
1954         return tcp_proc_register(&tcp4_seq_afinfo);
1955 }
1956
1957 void tcp4_proc_exit(void)
1958 {
1959         tcp_proc_unregister(&tcp4_seq_afinfo);
1960 }
1961 #endif /* CONFIG_PROC_FS */
1962
1963 struct proto tcp_prot = {
1964         .name                   = "TCP",
1965         .owner                  = THIS_MODULE,
1966         .close                  = tcp_close,
1967         .connect                = tcp_v4_connect,
1968         .disconnect             = tcp_disconnect,
1969         .accept                 = inet_csk_accept,
1970         .ioctl                  = tcp_ioctl,
1971         .init                   = tcp_v4_init_sock,
1972         .destroy                = tcp_v4_destroy_sock,
1973         .shutdown               = tcp_shutdown,
1974         .setsockopt             = tcp_setsockopt,
1975         .getsockopt             = tcp_getsockopt,
1976         .sendmsg                = tcp_sendmsg,
1977         .recvmsg                = tcp_recvmsg,
1978         .backlog_rcv            = tcp_v4_do_rcv,
1979         .hash                   = tcp_v4_hash,
1980         .unhash                 = tcp_unhash,
1981         .get_port               = tcp_v4_get_port,
1982         .enter_memory_pressure  = tcp_enter_memory_pressure,
1983         .sockets_allocated      = &tcp_sockets_allocated,
1984         .orphan_count           = &tcp_orphan_count,
1985         .memory_allocated       = &tcp_memory_allocated,
1986         .memory_pressure        = &tcp_memory_pressure,
1987         .sysctl_mem             = sysctl_tcp_mem,
1988         .sysctl_wmem            = sysctl_tcp_wmem,
1989         .sysctl_rmem            = sysctl_tcp_rmem,
1990         .max_header             = MAX_TCP_HEADER,
1991         .obj_size               = sizeof(struct tcp_sock),
1992         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
1993         .rsk_prot               = &tcp_request_sock_ops,
1994 };
1995
1996
1997
1998 void __init tcp_v4_init(struct net_proto_family *ops)
1999 {
2000         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2001         if (err < 0)
2002                 panic("Failed to create the TCP control socket.\n");
2003         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2004         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2005
2006         /* Unhash it so that IP input processing does not even
2007          * see it, we do not wish this socket to see incoming
2008          * packets.
2009          */
2010         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2011 }
2012
/* Symbols exported unconditionally. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);

/* Proc registration helpers exist only when procfs support is built in. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2032