]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - net/ipv4/tcp_output.c
tcp: back retransmit_high when it over-estimated
[linux-2.6-omap-h63xx.git] / net / ipv4 / tcp_output.c
index ad993ecb4810d88dc49bca51208a96e67c9555bb..8f9793a37b618540d27ec965b5489a3d1ff7a171 100644 (file)
@@ -5,8 +5,6 @@
  *
  *             Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:    $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
- *
  * Authors:    Ross Biro
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Mark Evans, <evansmp@uhura.aston.ac.uk>
@@ -347,28 +345,82 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
        TCP_SKB_CB(skb)->end_seq = seq;
 }
 
-static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
-                                        __u32 tstamp, __u8 **md5_hash)
-{
-       if (tp->rx_opt.tstamp_ok) {
+#define OPTION_SACK_ADVERTISE  (1 << 0)
+#define OPTION_TS              (1 << 1)
+#define OPTION_MD5             (1 << 2)
+
+struct tcp_out_options {
+       u8 options;             /* bit field of OPTION_* */
+       u8 ws;                  /* window scale, 0 to disable */
+       u8 num_sack_blocks;     /* number of SACK blocks to include */
+       u16 mss;                /* 0 to disable */
+       __u32 tsval, tsecr;     /* need to include OPTION_TS */
+};
+
+static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+                             const struct tcp_out_options *opts,
+                             __u8 **md5_hash) {
+       if (unlikely(OPTION_MD5 & opts->options)) {
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_NOP << 16) |
+                              (TCPOPT_MD5SIG << 8) |
+                              TCPOLEN_MD5SIG);
+               *md5_hash = (__u8 *)ptr;
+               ptr += 4;
+       } else {
+               *md5_hash = NULL;
+       }
+
+       if (likely(OPTION_TS & opts->options)) {
+               if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
+                       *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+                                      (TCPOLEN_SACK_PERM << 16) |
+                                      (TCPOPT_TIMESTAMP << 8) |
+                                      TCPOLEN_TIMESTAMP);
+               } else {
+                       *ptr++ = htonl((TCPOPT_NOP << 24) |
+                                      (TCPOPT_NOP << 16) |
+                                      (TCPOPT_TIMESTAMP << 8) |
+                                      TCPOLEN_TIMESTAMP);
+               }
+               *ptr++ = htonl(opts->tsval);
+               *ptr++ = htonl(opts->tsecr);
+       }
+
+       if (unlikely(opts->mss)) {
+               *ptr++ = htonl((TCPOPT_MSS << 24) |
+                              (TCPOLEN_MSS << 16) |
+                              opts->mss);
+       }
+
+       if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
+                    !(OPTION_TS & opts->options))) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_NOP << 16) |
-                              (TCPOPT_TIMESTAMP << 8) |
-                              TCPOLEN_TIMESTAMP);
-               *ptr++ = htonl(tstamp);
-               *ptr++ = htonl(tp->rx_opt.ts_recent);
+                              (TCPOPT_SACK_PERM << 8) |
+                              TCPOLEN_SACK_PERM);
        }
-       if (tp->rx_opt.eff_sacks) {
-               struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+
+       if (unlikely(opts->ws)) {
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_WINDOW << 16) |
+                              (TCPOLEN_WINDOW << 8) |
+                              opts->ws);
+       }
+
+       if (unlikely(opts->num_sack_blocks)) {
+               struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+                       tp->duplicate_sack : tp->selective_acks;
                int this_sack;
 
                *ptr++ = htonl((TCPOPT_NOP  << 24) |
                               (TCPOPT_NOP  << 16) |
                               (TCPOPT_SACK <<  8) |
-                              (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+                              (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
                                                     TCPOLEN_SACK_PERBLOCK)));
 
-               for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+               for (this_sack = 0; this_sack < opts->num_sack_blocks;
+                    ++this_sack) {
                        *ptr++ = htonl(sp[this_sack].start_seq);
                        *ptr++ = htonl(sp[this_sack].end_seq);
                }
@@ -378,81 +430,139 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
                        tp->rx_opt.eff_sacks--;
                }
        }
+}
+
+static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+                               struct tcp_out_options *opts,
+                               struct tcp_md5sig_key **md5) {
+       struct tcp_sock *tp = tcp_sk(sk);
+       unsigned size = 0;
+
 #ifdef CONFIG_TCP_MD5SIG
-       if (md5_hash) {
-               *ptr++ = htonl((TCPOPT_NOP << 24) |
-                              (TCPOPT_NOP << 16) |
-                              (TCPOPT_MD5SIG << 8) |
-                              TCPOLEN_MD5SIG);
-               *md5_hash = (__u8 *)ptr;
+       *md5 = tp->af_specific->md5_lookup(sk, sk);
+       if (*md5) {
+               opts->options |= OPTION_MD5;
+               size += TCPOLEN_MD5SIG_ALIGNED;
        }
+#else
+       *md5 = NULL;
 #endif
+
+       /* We always get an MSS option.  The option bytes which will be seen in
+        * normal data packets should timestamps be used, must be in the MSS
+        * advertised.  But we subtract them from tp->mss_cache so that
+        * calculations in tcp_sendmsg are simpler etc.  So account for this
+        * fact here if necessary.  If we don't do this correctly, as a
+        * receiver we won't recognize data packets as being full sized when we
+        * should, and thus we won't abide by the delayed ACK rules correctly.
+        * SACKs don't matter, we never delay an ACK when we have any of those
+        * going out.  */
+       opts->mss = tcp_advertise_mss(sk);
+       size += TCPOLEN_MSS_ALIGNED;
+
+       if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+               opts->options |= OPTION_TS;
+               opts->tsval = TCP_SKB_CB(skb)->when;
+               opts->tsecr = tp->rx_opt.ts_recent;
+               size += TCPOLEN_TSTAMP_ALIGNED;
+       }
+       if (likely(sysctl_tcp_window_scaling)) {
+               opts->ws = tp->rx_opt.rcv_wscale;
+               if(likely(opts->ws))
+                       size += TCPOLEN_WSCALE_ALIGNED;
+       }
+       if (likely(sysctl_tcp_sack)) {
+               opts->options |= OPTION_SACK_ADVERTISE;
+               if (unlikely(!(OPTION_TS & opts->options)))
+                       size += TCPOLEN_SACKPERM_ALIGNED;
+       }
+
+       return size;
 }
 
-/* Construct a tcp options header for a SYN or SYN_ACK packet.
- * If this is every changed make sure to change the definition of
- * MAX_SYN_SIZE to match the new maximum number of options that you
- * can generate.
- *
- * Note - that with the RFC2385 TCP option, we make room for the
- * 16 byte MD5 hash. This will be filled in later, so the pointer for the
- * location to be filled is passed back up.
- */
-static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
-                                 int offer_wscale, int wscale, __u32 tstamp,
-                                 __u32 ts_recent, __u8 **md5_hash)
-{
-       /* We always get an MSS option.
-        * The option bytes which will be seen in normal data
-        * packets should timestamps be used, must be in the MSS
-        * advertised.  But we subtract them from tp->mss_cache so
-        * that calculations in tcp_sendmsg are simpler etc.
-        * So account for this fact here if necessary.  If we
-        * don't do this correctly, as a receiver we won't
-        * recognize data packets as being full sized when we
-        * should, and thus we won't abide by the delayed ACK
-        * rules correctly.
-        * SACKs don't matter, we never delay an ACK when we
-        * have any of those going out.
-        */
-       *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
-       if (ts) {
-               if (sack)
-                       *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
-                                      (TCPOLEN_SACK_PERM << 16) |
-                                      (TCPOPT_TIMESTAMP << 8) |
-                                      TCPOLEN_TIMESTAMP);
-               else
-                       *ptr++ = htonl((TCPOPT_NOP << 24) |
-                                      (TCPOPT_NOP << 16) |
-                                      (TCPOPT_TIMESTAMP << 8) |
-                                      TCPOLEN_TIMESTAMP);
-               *ptr++ = htonl(tstamp);         /* TSVAL */
-               *ptr++ = htonl(ts_recent);      /* TSECR */
-       } else if (sack)
-               *ptr++ = htonl((TCPOPT_NOP << 24) |
-                              (TCPOPT_NOP << 16) |
-                              (TCPOPT_SACK_PERM << 8) |
-                              TCPOLEN_SACK_PERM);
-       if (offer_wscale)
-               *ptr++ = htonl((TCPOPT_NOP << 24) |
-                              (TCPOPT_WINDOW << 16) |
-                              (TCPOLEN_WINDOW << 8) |
-                              (wscale));
+static unsigned tcp_synack_options(struct sock *sk,
+                                  struct request_sock *req,
+                                  unsigned mss, struct sk_buff *skb,
+                                  struct tcp_out_options *opts,
+                                  struct tcp_md5sig_key **md5) {
+       unsigned size = 0;
+       struct inet_request_sock *ireq = inet_rsk(req);
+       char doing_ts;
+
 #ifdef CONFIG_TCP_MD5SIG
-       /*
-        * If MD5 is enabled, then we set the option, and include the size
-        * (always 18). The actual MD5 hash is added just before the
-        * packet is sent.
-        */
-       if (md5_hash) {
-               *ptr++ = htonl((TCPOPT_NOP << 24) |
-                              (TCPOPT_NOP << 16) |
-                              (TCPOPT_MD5SIG << 8) |
-                              TCPOLEN_MD5SIG);
-               *md5_hash = (__u8 *)ptr;
+       *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
+       if (*md5) {
+               opts->options |= OPTION_MD5;
+               size += TCPOLEN_MD5SIG_ALIGNED;
+       }
+#else
+       *md5 = NULL;
+#endif
+
+       /* we can't fit any SACK blocks in a packet with MD5 + TS
+          options. There was discussion about disabling SACK rather than TS in
+          order to fit in better with old, buggy kernels, but that was deemed
+          to be unnecessary. */
+       doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
+
+       opts->mss = mss;
+       size += TCPOLEN_MSS_ALIGNED;
+
+       if (likely(ireq->wscale_ok)) {
+               opts->ws = ireq->rcv_wscale;
+               if(likely(opts->ws))
+                       size += TCPOLEN_WSCALE_ALIGNED;
+       }
+       if (likely(doing_ts)) {
+               opts->options |= OPTION_TS;
+               opts->tsval = TCP_SKB_CB(skb)->when;
+               opts->tsecr = req->ts_recent;
+               size += TCPOLEN_TSTAMP_ALIGNED;
+       }
+       if (likely(ireq->sack_ok)) {
+               opts->options |= OPTION_SACK_ADVERTISE;
+               if (unlikely(!doing_ts))
+                       size += TCPOLEN_SACKPERM_ALIGNED;
+       }
+
+       return size;
+}
+
+static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
+                                       struct tcp_out_options *opts,
+                                       struct tcp_md5sig_key **md5) {
+       struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
+       struct tcp_sock *tp = tcp_sk(sk);
+       unsigned size = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+       *md5 = tp->af_specific->md5_lookup(sk, sk);
+       if (unlikely(*md5)) {
+               opts->options |= OPTION_MD5;
+               size += TCPOLEN_MD5SIG_ALIGNED;
        }
+#else
+       *md5 = NULL;
 #endif
+
+       if (likely(tp->rx_opt.tstamp_ok)) {
+               opts->options |= OPTION_TS;
+               opts->tsval = tcb ? tcb->when : 0;
+               opts->tsecr = tp->rx_opt.ts_recent;
+               size += TCPOLEN_TSTAMP_ALIGNED;
+       }
+
+       if (unlikely(tp->rx_opt.eff_sacks)) {
+               const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
+               opts->num_sack_blocks =
+                       min_t(unsigned, tp->rx_opt.eff_sacks,
+                             (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+                             TCPOLEN_SACK_PERBLOCK);
+               size += TCPOLEN_SACK_BASE_ALIGNED +
+                       opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+       }
+
+       return size;
 }
 
 /* This routine actually transmits TCP packets queued in by
@@ -473,13 +583,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        struct inet_sock *inet;
        struct tcp_sock *tp;
        struct tcp_skb_cb *tcb;
-       int tcp_header_size;
-#ifdef CONFIG_TCP_MD5SIG
+       struct tcp_out_options opts;
+       unsigned tcp_options_size, tcp_header_size;
        struct tcp_md5sig_key *md5;
        __u8 *md5_hash_location;
-#endif
        struct tcphdr *th;
-       int sysctl_flags;
        int err;
 
        BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -502,50 +610,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        inet = inet_sk(sk);
        tp = tcp_sk(sk);
        tcb = TCP_SKB_CB(skb);
-       tcp_header_size = tp->tcp_header_len;
+       memset(&opts, 0, sizeof(opts));
 
-#define SYSCTL_FLAG_TSTAMPS    0x1
-#define SYSCTL_FLAG_WSCALE     0x2
-#define SYSCTL_FLAG_SACK       0x4
-
-       sysctl_flags = 0;
-       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
-               tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
-               if (sysctl_tcp_timestamps) {
-                       tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
-                       sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
-               }
-               if (sysctl_tcp_window_scaling) {
-                       tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
-                       sysctl_flags |= SYSCTL_FLAG_WSCALE;
-               }
-               if (sysctl_tcp_sack) {
-                       sysctl_flags |= SYSCTL_FLAG_SACK;
-                       if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
-                               tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
-               }
-       } else if (unlikely(tp->rx_opt.eff_sacks)) {
-               /* A SACK is 2 pad bytes, a 2 byte header, plus
-                * 2 32-bit sequence numbers for each SACK block.
-                */
-               tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
-                                   (tp->rx_opt.eff_sacks *
-                                    TCPOLEN_SACK_PERBLOCK));
-       }
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+               tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+       else
+               tcp_options_size = tcp_established_options(sk, skb, &opts,
+                                                          &md5);
+       tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
        if (tcp_packets_in_flight(tp) == 0)
                tcp_ca_event(sk, CA_EVENT_TX_START);
 
-#ifdef CONFIG_TCP_MD5SIG
-       /*
-        * Are we doing MD5 on this segment? If so - make
-        * room for it.
-        */
-       md5 = tp->af_specific->md5_lookup(sk, sk);
-       if (md5)
-               tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
-#endif
-
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);
        skb_set_owner_w(skb, sk);
@@ -576,39 +652,16 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                th->urg                 = 1;
        }
 
-       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
-               tcp_syn_build_options((__be32 *)(th + 1),
-                                     tcp_advertise_mss(sk),
-                                     (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
-                                     (sysctl_flags & SYSCTL_FLAG_SACK),
-                                     (sysctl_flags & SYSCTL_FLAG_WSCALE),
-                                     tp->rx_opt.rcv_wscale,
-                                     tcb->when,
-                                     tp->rx_opt.ts_recent,
-
-#ifdef CONFIG_TCP_MD5SIG
-                                     md5 ? &md5_hash_location :
-#endif
-                                     NULL);
-       } else {
-               tcp_build_and_update_options((__be32 *)(th + 1),
-                                            tp, tcb->when,
-#ifdef CONFIG_TCP_MD5SIG
-                                            md5 ? &md5_hash_location :
-#endif
-                                            NULL);
+       tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+       if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
                TCP_ECN_send(sk, skb, tcp_header_size);
-       }
 
 #ifdef CONFIG_TCP_MD5SIG
        /* Calculate the MD5 hash, as we have all we need now */
        if (md5) {
+               sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                tp->af_specific->calc_md5_hash(md5_hash_location,
-                                              md5,
-                                              sk, NULL, NULL,
-                                              tcp_hdr(skb),
-                                              sk->sk_protocol,
-                                              skb->len);
+                                              md5, sk, NULL, skb);
        }
 #endif
 
@@ -621,7 +674,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                tcp_event_data_sent(tp, skb, sk);
 
        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
-               TCP_INC_STATS(TCP_MIB_OUTSEGS);
+               TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
        err = icsk->icsk_af_ops->queue_xmit(skb, 0);
        if (likely(err <= 0))
@@ -630,10 +683,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        tcp_enter_cwr(sk, 1);
 
        return net_xmit_eval(err);
-
-#undef SYSCTL_FLAG_TSTAMPS
-#undef SYSCTL_FLAG_WSCALE
-#undef SYSCTL_FLAG_SACK
 }
 
 /* This routine just queue's the buffer
@@ -974,6 +1023,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
        u32 mss_now;
        u16 xmit_size_goal;
        int doing_tso = 0;
+       unsigned header_len;
+       struct tcp_out_options opts;
+       struct tcp_md5sig_key *md5;
 
        mss_now = tp->mss_cache;
 
@@ -986,14 +1038,16 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
                        mss_now = tcp_sync_mss(sk, mtu);
        }
 
-       if (tp->rx_opt.eff_sacks)
-               mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-                           (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
-
-#ifdef CONFIG_TCP_MD5SIG
-       if (tp->af_specific->md5_lookup(sk, sk))
-               mss_now -= TCPOLEN_MD5SIG_ALIGNED;
-#endif
+       header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+                    sizeof(struct tcphdr);
+       /* The mss_cache is sized based on tp->tcp_header_len, which assumes
+        * some common options. If this is an odd packet (because we have SACK
+        * blocks etc) then our calculated header_len will be different, and
+        * we have to adjust mss_now correspondingly */
+       if (header_len != tp->tcp_header_len) {
+               int delta = (int) header_len - tp->tcp_header_len;
+               mss_now -= delta;
+       }
 
        xmit_size_goal = mss_now;
 
@@ -1770,6 +1824,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
 
        /* changed transmit queue under us so clear hints */
        tcp_clear_retrans_hints_partial(tp);
+       if (next_skb == tp->retransmit_skb_hint)
+               tp->retransmit_skb_hint = skb;
 
        sk_wmem_free_skb(sk, next_skb);
 }
@@ -1784,7 +1840,7 @@ void tcp_simple_retransmit(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
-       int lost = 0;
+       u32 prior_lost = tp->lost_out;
 
        tcp_for_write_queue(skb, sk) {
                if (skb == tcp_send_head(sk))
@@ -1795,17 +1851,13 @@ void tcp_simple_retransmit(struct sock *sk)
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                                tp->retrans_out -= tcp_skb_pcount(skb);
                        }
-                       if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
-                               TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-                               tp->lost_out += tcp_skb_pcount(skb);
-                               lost = 1;
-                       }
+                       tcp_skb_mark_lost_uncond_verify(tp, skb);
                }
        }
 
-       tcp_clear_all_retrans_hints(tp);
+       tcp_clear_retrans_hints_partial(tp);
 
-       if (!lost)
+       if (prior_lost == tp->lost_out)
                return;
 
        if (tcp_is_reno(tp))
@@ -1913,7 +1965,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 
        if (err == 0) {
                /* Update global TCP statistics. */
-               TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+               TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
 
                tp->total_retrans++;
 
@@ -1942,83 +1994,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        return err;
 }
 
-/* This gets called after a retransmit timeout, and the initially
- * retransmitted data is acknowledged.  It tries to continue
- * resending the rest of the retransmit queue, until either
- * we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
- */
-void tcp_xmit_retransmit_queue(struct sock *sk)
+static int tcp_can_forward_retransmit(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
-       int packet_cnt;
-
-       if (tp->retransmit_skb_hint) {
-               skb = tp->retransmit_skb_hint;
-               packet_cnt = tp->retransmit_cnt_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
-               packet_cnt = 0;
-       }
-
-       /* First pass: retransmit lost packets. */
-       if (tp->lost_out) {
-               tcp_for_write_queue_from(skb, sk) {
-                       __u8 sacked = TCP_SKB_CB(skb)->sacked;
-
-                       if (skb == tcp_send_head(sk))
-                               break;
-                       /* we could do better than to assign each time */
-                       tp->retransmit_skb_hint = skb;
-                       tp->retransmit_cnt_hint = packet_cnt;
-
-                       /* Assume this retransmit will generate
-                        * only one packet for congestion window
-                        * calculation purposes.  This works because
-                        * tcp_retransmit_skb() will chop up the
-                        * packet to be MSS sized and all the
-                        * packet counting works out.
-                        */
-                       if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-                               return;
-
-                       if (sacked & TCPCB_LOST) {
-                               if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-                                       if (tcp_retransmit_skb(sk, skb)) {
-                                               tp->retransmit_skb_hint = NULL;
-                                               return;
-                                       }
-                                       if (icsk->icsk_ca_state != TCP_CA_Loss)
-                                               NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
-                                       else
-                                               NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
-
-                                       if (skb == tcp_write_queue_head(sk))
-                                               inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-                                                                         inet_csk(sk)->icsk_rto,
-                                                                         TCP_RTO_MAX);
-                               }
-
-                               packet_cnt += tcp_skb_pcount(skb);
-                               if (packet_cnt >= tp->lost_out)
-                                       break;
-                       }
-               }
-       }
-
-       /* OK, demanded retransmission is finished. */
 
        /* Forward retransmissions are possible only during Recovery. */
        if (icsk->icsk_ca_state != TCP_CA_Recovery)
-               return;
+               return 0;
 
        /* No forward retransmissions in Reno are possible. */
        if (tcp_is_reno(tp))
-               return;
+               return 0;
 
        /* Yeah, we have to make difficult choice between forward transmission
         * and retransmission... Both ways have their merits...
@@ -2029,43 +2016,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
         */
 
        if (tcp_may_send_now(sk))
-               return;
+               return 0;
 
-       /* If nothing is SACKed, highest_sack in the loop won't be valid */
-       if (!tp->sacked_out)
-               return;
+       return 1;
+}
 
-       if (tp->forward_skb_hint)
-               skb = tp->forward_skb_hint;
-       else
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged.  It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessarily retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb;
+       struct sk_buff *hole = NULL;
+       u32 last_lost;
+       int mib_idx;
+       int fwd_rexmitting = 0;
+
+       if (!tp->lost_out)
+               tp->retransmit_high = tp->snd_una;
+
+       if (tp->retransmit_skb_hint) {
+               skb = tp->retransmit_skb_hint;
+               last_lost = TCP_SKB_CB(skb)->end_seq;
+               if (after(last_lost, tp->retransmit_high))
+                       last_lost = tp->retransmit_high;
+       } else {
                skb = tcp_write_queue_head(sk);
+               last_lost = tp->snd_una;
+       }
 
+       /* First pass: retransmit lost packets. */
        tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-               tp->forward_skb_hint = skb;
+               __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
-               if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+               if (skb == tcp_send_head(sk))
                        break;
+               /* we could do better than to assign each time */
+               if (hole == NULL)
+                       tp->retransmit_skb_hint = skb;
 
+               /* Assume this retransmit will generate
+                * only one packet for congestion window
+                * calculation purposes.  This works because
+                * tcp_retransmit_skb() will chop up the
+                * packet to be MSS sized and all the
+                * packet counting works out.
+                */
                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-                       break;
+                       return;
+
+               if (fwd_rexmitting) {
+begin_fwd:
+                       if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+                               break;
+                       mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
+
+               } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+                       tp->retransmit_high = last_lost;
+                       if (!tcp_can_forward_retransmit(sk))
+                               break;
+                       /* Backtrack if necessary to non-L'ed skb */
+                       if (hole != NULL) {
+                               skb = hole;
+                               hole = NULL;
+                       }
+                       fwd_rexmitting = 1;
+                       goto begin_fwd;
 
-               if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+               } else if (!(sacked & TCPCB_LOST)) {
+                       if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+                               hole = skb;
                        continue;
 
-               /* Ok, retransmit it. */
-               if (tcp_retransmit_skb(sk, skb)) {
-                       tp->forward_skb_hint = NULL;
-                       break;
+               } else {
+                       last_lost = TCP_SKB_CB(skb)->end_seq;
+                       if (icsk->icsk_ca_state != TCP_CA_Loss)
+                               mib_idx = LINUX_MIB_TCPFASTRETRANS;
+                       else
+                               mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
                }
 
+               if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+                       continue;
+
+               if (tcp_retransmit_skb(sk, skb))
+                       return;
+               NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
                if (skb == tcp_write_queue_head(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  inet_csk(sk)->icsk_rto,
                                                  TCP_RTO_MAX);
-
-               NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
        }
 }
 
@@ -2119,7 +2167,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
-               NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
                return;
        }
 
@@ -2130,9 +2178,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
        /* Send it off. */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (tcp_transmit_skb(sk, skb, 0, priority))
-               NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
 
-       TCP_INC_STATS(TCP_MIB_OUTRSTS);
+       TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
 }
 
 /* WARNING: This routine must only be called when we have already sent
@@ -2180,11 +2228,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcphdr *th;
        int tcp_header_size;
+       struct tcp_out_options opts;
        struct sk_buff *skb;
-#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *md5;
        __u8 *md5_hash_location;
-#endif
 
        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
@@ -2195,18 +2242,27 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
        skb->dst = dst_clone(dst);
 
-       tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
-                          (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
-                          (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
-                          /* SACK_PERM is in the place of NOP NOP of TS */
-                          ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+       if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+               __u8 rcv_wscale;
+               /* Set this up on the first call only */
+               req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+               /* tcp_full_space because it is guaranteed to be the first packet */
+               tcp_select_initial_window(tcp_full_space(sk),
+                       dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+                       &req->rcv_wnd,
+                       &req->window_clamp,
+                       ireq->wscale_ok,
+                       &rcv_wscale);
+               ireq->rcv_wscale = rcv_wscale;
+       }
+
+       memset(&opts, 0, sizeof(opts));
+       TCP_SKB_CB(skb)->when = tcp_time_stamp;
+       tcp_header_size = tcp_synack_options(sk, req,
+                                            dst_metric(dst, RTAX_ADVMSS),
+                                            skb, &opts, &md5) +
+                         sizeof(struct tcphdr);
 
-#ifdef CONFIG_TCP_MD5SIG
-       /* Are we doing MD5 on this segment? If so - make room for it */
-       md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
-       if (md5)
-               tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
-#endif
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);
 
@@ -2224,19 +2280,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                             TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
-       if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-               __u8 rcv_wscale;
-               /* Set this up on the first call only */
-               req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-               /* tcp_full_space because it is guaranteed to be the first packet */
-               tcp_select_initial_window(tcp_full_space(sk),
-                       dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-                       &req->rcv_wnd,
-                       &req->window_clamp,
-                       ireq->wscale_ok,
-                       &rcv_wscale);
-               ireq->rcv_wscale = rcv_wscale;
-       }
 
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(min(req->rcv_wnd, 65535U));
@@ -2245,29 +2288,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
        else
 #endif
-       TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
-                             ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
-                             TCP_SKB_CB(skb)->when,
-                             req->ts_recent,
-                             (
-#ifdef CONFIG_TCP_MD5SIG
-                              md5 ? &md5_hash_location :
-#endif
-                              NULL)
-                             );
-
+       tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
        th->doff = (tcp_header_size >> 2);
-       TCP_INC_STATS(TCP_MIB_OUTSEGS);
+       TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
        /* Okay, we have all we need - do the md5 hash if needed */
        if (md5) {
                tp->af_specific->calc_md5_hash(md5_hash_location,
-                                              md5,
-                                              NULL, dst, req,
-                                              tcp_hdr(skb), sk->sk_protocol,
-                                              skb->len);
+                                              md5, NULL, req, skb);
        }
 #endif
 
@@ -2367,7 +2396,7 @@ int tcp_connect(struct sock *sk)
         */
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;
-       TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
+       TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
 
        /* Timer for repeating the SYN until an answer. */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,