tcp: Restore ordering of TCP options for the sake of inter-operability
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8165f5aa8c71b68f5d01dee2abb8dad5c7449c44..e4c5ac9fe89bf6c693690f22e5e0e4160443acd4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
        TCP_SKB_CB(skb)->end_seq = seq;
 }
 
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+       return tp->snd_una != tp->snd_up;
+}
+
 #define OPTION_SACK_ADVERTISE  (1 << 0)
 #define OPTION_TS              (1 << 1)
 #define OPTION_MD5             (1 << 2)
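
The tcp_urg_mode() helper above derives urgent mode from sequence state that is already maintained, replacing the tp->urg_mode flag this patch retires elsewhere. A minimal userspace sketch of the predicate, not kernel code; struct tcp_sock_model and the sequence values are illustrative stand-ins:

#include <stdio.h>

/* Stand-in for the two tcp_sock fields the predicate reads. */
struct tcp_sock_model {
        unsigned int snd_una;   /* oldest unacknowledged sequence number */
        unsigned int snd_up;    /* urgent pointer */
};

/* Urgent mode is in effect while sent urgent data is still unacked. */
static int tcp_urg_mode(const struct tcp_sock_model *tp)
{
        return tp->snd_una != tp->snd_up;
}

int main(void)
{
        struct tcp_sock_model tp = { .snd_una = 1000, .snd_up = 1000 };

        printf("urg? %d\n", tcp_urg_mode(&tp));  /* 0: nothing urgent */
        tp.snd_up = 1005;                        /* urgent pointer advanced */
        printf("urg? %d\n", tcp_urg_mode(&tp));  /* 1: urgent data pending */
        return 0;
}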
@@ -357,6 +362,17 @@ struct tcp_out_options {
        __u32 tsval, tsecr;     /* need to include OPTION_TS */
 };
 
+/* Beware: something in the Internet is very sensitive to the ordering of
+ * TCP options; we learned this the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance, but from
+ * an interoperability perspective we seem to be stuck with the ordering
+ * we have been using if we want to keep working with those broken things
+ * (not that it currently hurts anybody, as there is no particular reason
+ * why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to disaster
+ * (but it may well be that other scenarios fail similarly).
+ */
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                              const struct tcp_out_options *opts,
                              __u8 **md5_hash) {
@@ -371,6 +387,12 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                *md5_hash = NULL;
        }
 
+       if (unlikely(opts->mss)) {
+               *ptr++ = htonl((TCPOPT_MSS << 24) |
+                              (TCPOLEN_MSS << 16) |
+                              opts->mss);
+       }
+
        if (likely(OPTION_TS & opts->options)) {
                if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
                        *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
@@ -387,12 +409,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                *ptr++ = htonl(opts->tsecr);
        }
 
-       if (unlikely(opts->mss)) {
-               *ptr++ = htonl((TCPOPT_MSS << 24) |
-                              (TCPOLEN_MSS << 16) |
-                              opts->mss);
-       }
-
        if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
                     !(OPTION_TS & opts->options))) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
@@ -427,7 +443,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
                if (tp->rx_opt.dsack) {
                        tp->rx_opt.dsack = 0;
-                       tp->rx_opt.eff_sacks--;
+                       tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
                }
        }
 }
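
Each option is emitted as a packed, network-order 32-bit word, so the write order in tcp_options_write() is exactly the on-wire byte order; moving the MSS write ahead of the SACK_PERM/TS words is what restores the traditional layout. A standalone sketch of the packing, with TCPOPT_MSS/TCPOLEN_MSS duplicated here and an illustrative MSS value:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_MSS   2
#define TCPOLEN_MSS  4

int main(void)
{
        uint16_t mss = 1460;
        uint32_t word = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
        uint8_t *b = (uint8_t *)&word;

        /* prints "02 04 05 b4": kind=2, len=4, mss=1460, as seen on the wire */
        printf("%02x %02x %02x %02x\n", b[0], b[1], b[2], b[3]);
        return 0;
}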
@@ -646,7 +662,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        th->check               = 0;
        th->urg_ptr             = 0;
 
-       if (unlikely(tp->urg_mode &&
+       /* The urg_mode check is necessary during a below-snd_una window probe */
+       if (unlikely(tcp_urg_mode(tp) &&
                     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
                th->urg_ptr             = htons(tp->snd_up - tcb->seq);
                th->urg                 = 1;
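
The between(seq1, seq2, seq3) helper used here is the kernel's wrap-safe test for seq2 <= seq1 <= seq3 in 32-bit sequence space, so the URG fields are only filled in when snd_up lies within 0xFFFF beyond this segment's sequence number. A userspace model of that check; the local between() mirrors the helper in include/net/tcp.h and the sequence numbers are illustrative:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe: true iff seq2 <= seq1 <= seq3 modulo 2^32. */
static int between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
{
        return seq3 - seq2 >= seq1 - seq2;
}

int main(void)
{
        uint32_t seq = 4294967000u;     /* deliberately near wraparound */

        /* snd_up just ahead of the segment: URG pointer would be emitted */
        printf("%d\n", between(seq + 50, seq + 1, seq + 0xFFFF));  /* 1 */
        /* snd_up behind the segment: no URG, even across the wrap */
        printf("%d\n", between(seq - 10, seq + 1, seq + 0xFFFF));  /* 0 */
        return 0;
}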
@@ -1012,7 +1029,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
  *
- * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
@@ -1029,7 +1046,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
        mss_now = tp->mss_cache;
 
-       if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
+       if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp))
                doing_tso = 1;
 
        if (dst) {
@@ -1193,7 +1210,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
        /* Don't use the nagle rule for urgent data (or for the final FIN).
         * Nagle can be ignored during F-RTO too (see RFC4138).
         */
-       if (tp->urg_mode || (tp->frto_counter == 2) ||
+       if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
                return 1;
 
@@ -1824,6 +1841,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
 
        /* changed transmit queue under us so clear hints */
        tcp_clear_retrans_hints_partial(tp);
+       if (next_skb == tp->retransmit_skb_hint)
+               tp->retransmit_skb_hint = skb;
 
        sk_wmem_free_skb(sk, next_skb);
 }
@@ -1838,7 +1857,7 @@ void tcp_simple_retransmit(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
-       int lost = 0;
+       u32 prior_lost = tp->lost_out;
 
        tcp_for_write_queue(skb, sk) {
                if (skb == tcp_send_head(sk))
@@ -1849,17 +1868,13 @@ void tcp_simple_retransmit(struct sock *sk)
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                                tp->retrans_out -= tcp_skb_pcount(skb);
                        }
-                       if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
-                               TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-                               tp->lost_out += tcp_skb_pcount(skb);
-                               lost = 1;
-                       }
+                       tcp_skb_mark_lost_uncond_verify(tp, skb);
                }
        }
 
-       tcp_clear_all_retrans_hints(tp);
+       tcp_clear_retrans_hints_partial(tp);
 
-       if (!lost)
+       if (prior_lost == tp->lost_out)
                return;
 
        if (tcp_is_reno(tp))
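
The local lost flag is gone in favor of snapshotting tp->lost_out before the marking loop and comparing afterwards, which also tells us how much was newly marked. A trivial userspace sketch of the pattern, with stand-in names and values:

#include <stdio.h>

int main(void)
{
        unsigned int lost_out = 3;              /* stand-in for tp->lost_out */
        unsigned int prior_lost = lost_out;     /* snapshot before the loop */

        lost_out += 2;          /* e.g. two skbs marked lost during the walk */

        if (prior_lost == lost_out)
                printf("nothing newly lost, bail out early\n");
        else
                printf("%u segments newly marked lost\n", lost_out - prior_lost);
        return 0;
}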
@@ -1934,8 +1949,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
-           (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
            (!tcp_skb_is_last(sk, skb)) &&
+           (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
            (skb_shinfo(skb)->nr_frags == 0 &&
             skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
            (tcp_skb_pcount(skb) == 1 &&
@@ -1996,86 +2011,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        return err;
 }
 
-/* This gets called after a retransmit timeout, and the initially
- * retransmitted data is acknowledged.  It tries to continue
- * resending the rest of the retransmit queue, until either
- * we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
- */
-void tcp_xmit_retransmit_queue(struct sock *sk)
+static int tcp_can_forward_retransmit(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
-       int packet_cnt;
-
-       if (tp->retransmit_skb_hint) {
-               skb = tp->retransmit_skb_hint;
-               packet_cnt = tp->retransmit_cnt_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
-               packet_cnt = 0;
-       }
-
-       /* First pass: retransmit lost packets. */
-       if (tp->lost_out) {
-               tcp_for_write_queue_from(skb, sk) {
-                       __u8 sacked = TCP_SKB_CB(skb)->sacked;
-
-                       if (skb == tcp_send_head(sk))
-                               break;
-                       /* we could do better than to assign each time */
-                       tp->retransmit_skb_hint = skb;
-                       tp->retransmit_cnt_hint = packet_cnt;
-
-                       /* Assume this retransmit will generate
-                        * only one packet for congestion window
-                        * calculation purposes.  This works because
-                        * tcp_retransmit_skb() will chop up the
-                        * packet to be MSS sized and all the
-                        * packet counting works out.
-                        */
-                       if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-                               return;
-
-                       if (sacked & TCPCB_LOST) {
-                               if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-                                       int mib_idx;
-
-                                       if (tcp_retransmit_skb(sk, skb)) {
-                                               tp->retransmit_skb_hint = NULL;
-                                               return;
-                                       }
-                                       if (icsk->icsk_ca_state != TCP_CA_Loss)
-                                               mib_idx = LINUX_MIB_TCPFASTRETRANS;
-                                       else
-                                               mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
-                                       NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
-                                       if (skb == tcp_write_queue_head(sk))
-                                               inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-                                                                         inet_csk(sk)->icsk_rto,
-                                                                         TCP_RTO_MAX);
-                               }
-
-                               packet_cnt += tcp_skb_pcount(skb);
-                               if (packet_cnt >= tp->lost_out)
-                                       break;
-                       }
-               }
-       }
-
-       /* OK, demanded retransmission is finished. */
 
        /* Forward retransmissions are possible only during Recovery. */
        if (icsk->icsk_ca_state != TCP_CA_Recovery)
-               return;
+               return 0;
 
        /* No forward retransmissions in Reno are possible. */
        if (tcp_is_reno(tp))
-               return;
+               return 0;
 
        /* Yeah, we have to make difficult choice between forward transmission
         * and retransmission... Both ways have their merits...
@@ -2086,43 +2033,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
         */
 
        if (tcp_may_send_now(sk))
-               return;
+               return 0;
 
-       /* If nothing is SACKed, highest_sack in the loop won't be valid */
-       if (!tp->sacked_out)
-               return;
+       return 1;
+}
 
-       if (tp->forward_skb_hint)
-               skb = tp->forward_skb_hint;
-       else
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged.  It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessary retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb;
+       struct sk_buff *hole = NULL;
+       u32 last_lost;
+       int mib_idx;
+       int fwd_rexmitting = 0;
+
+       if (!tp->lost_out)
+               tp->retransmit_high = tp->snd_una;
+
+       if (tp->retransmit_skb_hint) {
+               skb = tp->retransmit_skb_hint;
+               last_lost = TCP_SKB_CB(skb)->end_seq;
+               if (after(last_lost, tp->retransmit_high))
+                       last_lost = tp->retransmit_high;
+       } else {
                skb = tcp_write_queue_head(sk);
+               last_lost = tp->snd_una;
+       }
 
+       /* First pass: retransmit lost packets. */
        tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-               tp->forward_skb_hint = skb;
+               __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
-               if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+               if (skb == tcp_send_head(sk))
                        break;
+               /* we could do better than to assign each time */
+               if (hole == NULL)
+                       tp->retransmit_skb_hint = skb;
 
+               /* Assume this retransmit will generate
+                * only one packet for congestion window
+                * calculation purposes.  This works because
+                * tcp_retransmit_skb() will chop up the
+                * packet to be MSS sized and all the
+                * packet counting works out.
+                */
                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
-                       break;
+                       return;
+
+               if (fwd_rexmitting) {
+begin_fwd:
+                       if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+                               break;
+                       mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
 
-               if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+               } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+                       tp->retransmit_high = last_lost;
+                       if (!tcp_can_forward_retransmit(sk))
+                               break;
+                       /* Backtrack if necessary to a non-LOST-marked skb */
+                       if (hole != NULL) {
+                               skb = hole;
+                               hole = NULL;
+                       }
+                       fwd_rexmitting = 1;
+                       goto begin_fwd;
+
+               } else if (!(sacked & TCPCB_LOST)) {
+                       if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+                               hole = skb;
                        continue;
 
-               /* Ok, retransmit it. */
-               if (tcp_retransmit_skb(sk, skb)) {
-                       tp->forward_skb_hint = NULL;
-                       break;
+               } else {
+                       last_lost = TCP_SKB_CB(skb)->end_seq;
+                       if (icsk->icsk_ca_state != TCP_CA_Loss)
+                               mib_idx = LINUX_MIB_TCPFASTRETRANS;
+                       else
+                               mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
                }
 
+               if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+                       continue;
+
+               if (tcp_retransmit_skb(sk, skb))
+                       return;
+               NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
                if (skb == tcp_write_queue_head(sk))
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  inet_csk(sk)->icsk_rto,
                                                  TCP_RTO_MAX);
-
-               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS);
        }
 }
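
To follow the rewritten control flow: the old two passes are folded into one scan that retransmits LOST segments below retransmit_high, remembers the first segment that is neither SACKed nor already retransmitted as the hole, and rewinds to it once forward retransmission becomes permissible. A heavily simplified userspace model of that traversal; the flags, sequence numbers, and thresholds are stand-ins, and cwnd/timer handling is omitted:

#include <stdio.h>

enum { LOST = 1, SACKED = 2, RETRANS = 4 };     /* stand-in sacked bits */

int main(void)
{
        int sacked[] = { LOST, 0, SACKED, LOST, 0, 0 };
        int seq[]    = { 10, 20, 30, 40, 50, 60 };
        int n = 6, retransmit_high = 50, highest_sack_seq = 60;
        int hole = -1, fwd = 0, i = 0;

        while (i < n) {
                if (fwd) {
                        /* Forward pass ends at the highest SACKed seq. */
                        if (seq[i] >= highest_sack_seq)
                                break;
                } else if (seq[i] >= retransmit_high) {
                        /* Lost region exhausted: rewind to the hole and
                         * switch to forward retransmissions. */
                        if (hole >= 0) {
                                i = hole;
                                hole = -1;
                        }
                        fwd = 1;
                        continue;
                } else if (!(sacked[i] & LOST)) {
                        /* Remember the first forward-rexmit candidate. */
                        if (hole < 0 && !(sacked[i] & (SACKED | RETRANS)))
                                hole = i;
                        i++;
                        continue;
                }
                if (sacked[i] & (SACKED | RETRANS)) {   /* already handled */
                        i++;
                        continue;
                }
                printf("%s rexmit seq %d\n", fwd ? "forward" : "lost", seq[i]);
                sacked[i] |= RETRANS;
                i++;
        }
        return 0;
}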
 
@@ -2241,6 +2249,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        struct sk_buff *skb;
        struct tcp_md5sig_key *md5;
        __u8 *md5_hash_location;
+       int mss;
 
        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
@@ -2251,13 +2260,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
        skb->dst = dst_clone(dst);
 
+       mss = dst_metric(dst, RTAX_ADVMSS);
+       if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+               mss = tp->rx_opt.user_mss;
+
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
-                       dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+                       mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        ireq->wscale_ok,
@@ -2267,8 +2280,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
        memset(&opts, 0, sizeof(opts));
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       tcp_header_size = tcp_synack_options(sk, req,
-                                            dst_metric(dst, RTAX_ADVMSS),
+       tcp_header_size = tcp_synack_options(sk, req, mss,
                                             skb, &opts, &md5) +
                          sizeof(struct tcphdr);
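
The new mss clamp makes SYN-ACKs honor a user-supplied MSS as well; tp->rx_opt.user_mss is populated by the TCP_MAXSEG socket option on the listening socket. A minimal userspace illustration of where that value originates (error handling mostly elided):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int mss = 1200;         /* illustrative value, below a typical advmss */

        /* Stored in tp->rx_opt.user_mss; applied when it is below the
         * route's advertised MSS, as in the clamp above. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
                perror("setsockopt(TCP_MAXSEG)");
        return 0;
}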
 
@@ -2280,7 +2292,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        th->syn = 1;
        th->ack = 1;
        TCP_ECN_make_synack(req, th);
-       th->source = inet_sk(sk)->sport;
+       th->source = ireq->loc_port;
        th->dest = ireq->rmt_port;
        /* Setting of flags are superfluous here for callers (and ECE is
         * not even correctly set)
@@ -2342,6 +2354,9 @@ static void tcp_connect_init(struct sock *sk)
        if (!tp->window_clamp)
                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
        tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+       if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
+               tp->advmss = tp->rx_opt.user_mss;
+
        tcp_initialize_rcv_mss(sk);
 
        tcp_select_initial_window(tcp_full_space(sk),
@@ -2360,6 +2375,7 @@ static void tcp_connect_init(struct sock *sk)
        tcp_init_wl(tp, tp->write_seq, 0);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
+       tp->snd_up = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;
@@ -2569,8 +2585,7 @@ int tcp_write_wakeup(struct sock *sk)
                        tcp_event_new_data_sent(sk, skb);
                return err;
        } else {
-               if (tp->urg_mode &&
-                   between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+               if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
                        tcp_xmit_probe_skb(sk, 1);
                return tcp_xmit_probe_skb(sk, 0);
        }
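
The explicit urg_mode test can go because tcp_connect_init() now initializes snd_up to write_seq (see the hunk above): with no urgent data pending, snd_up == snd_una and the wrap-safe between() check is false on its own. A quick userspace check of that claim, with between() mirroring the kernel helper and illustrative sequence numbers:

#include <stdint.h>
#include <stdio.h>

static int between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
{
        return seq3 - seq2 >= seq1 - seq2;
}

int main(void)
{
        uint32_t snd_una = 123456;
        uint32_t snd_up  = snd_una;     /* no urgent data pending */

        /* false: no urg probe is sent, same behavior as the old flag test */
        printf("%d\n", between(snd_up, snd_una + 1, snd_una + 0xFFFF)); /* 0 */

        snd_up = snd_una + 42;          /* urgent data in flight */
        printf("%d\n", between(snd_up, snd_una + 1, snd_una + 0xFFFF)); /* 1 */
        return 0;
}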