Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland...
author Linus Torvalds <torvalds@woody.linux-foundation.org>
Fri, 12 Oct 2007 02:43:13 +0000 (19:43 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Fri, 12 Oct 2007 02:43:13 +0000 (19:43 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (87 commits)
  mlx4_core: Fix section mismatches
  IPoIB: Allow setting policy to ignore multicast groups
  IB/mthca: Mark error paths as unlikely() in post_srq_recv functions
  IB/ipath: Minor fix to ordering of freeing and zeroing of tid pages.
  IB/ipath: Remove redundant link state checks
  IB/ipath: Fix IB_EVENT_PORT_ERR event
  IB/ipath: Better handling of unexpected GPIO interrupts
  IB/ipath: Maintain active time on all chips
  IB/ipath: Fix QHT7040 serial number check
  IB/ipath: Indicate a couple of chip bugs to userspace
  IB/ipath: iba6110 rev4 no longer needs recv header overrun workaround
  IB/ipath: Use counters in ipath_poll and cleanup interrupts in ipath_close
  IB/ipath: Remove duplicate copy of LMC
  IB/ipath: Add ability to set the LMC via the sysfs debugging interface
  IB/ipath: Optimize completion queue entry insertion and polling
  IB/ipath: Implement IB_EVENT_QP_LAST_WQE_REACHED
  IB/ipath: Generate flush CQE when QP is in error state
  IB/ipath: Remove redundant code
  IB/ipath: Future proof eeprom checksum code (contents reading)
  IB/ipath: UC RDMA WRITE with IMMEDIATE doesn't send the immediate
  ...

drivers/infiniband/core/cma.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c

index 2e641b255db48b197ab51bcb49e2dcde94e1aae3,72539529fa661f413accc47460fde6258839f542..93644f82592c426074a9beba068b4e23d4910b79
@@@ -52,6 -52,7 +52,7 @@@ MODULE_LICENSE("Dual BSD/GPL")
  
  #define CMA_CM_RESPONSE_TIMEOUT 20
  #define CMA_MAX_CM_RETRIES 15
+ #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
  
  static void cma_add_one(struct ib_device *device);
  static void cma_remove_one(struct ib_device *device);
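Note on the hunk above: the new CMA_CM_MRA_SETTING packs two values into the service_timeout argument of ib_send_cm_mra(). IB_CM_MRA_FLAG_DELAY (per the ib_cm interface of this period) tells the CM to hold the MRA and send it only if a duplicate REQ arrives, and the low 5 bits carry a service timeout of 24, which per the InfiniBand spec works out to 4.096 us * 2^24, roughly 69 seconds. A throwaway userspace check of that arithmetic (constants illustrative, not copied from the kernel):

#include <stdio.h>

int main(void)
{
        unsigned int t = 24;                    /* 5-bit service timeout */
        double us = 4.096 * (double)(1u << t);  /* 4.096 us * 2^t */
        printf("MRA service timeout: ~%.1f s\n", us / 1e6); /* ~68.7 s */
        return 0;
}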
@@@ -138,6 -139,7 +139,7 @@@ struct rdma_id_private 
        u32                     qkey;
        u32                     qp_num;
        u8                      srq;
+       u8                      tos;
  };
  
  struct cma_multicast {
@@@ -1089,6 -1091,7 +1091,7 @@@ static int cma_req_handler(struct ib_cm
                event.param.ud.private_data_len =
                                IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
        } else {
+               ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
                conn_id = cma_new_conn_id(&listen_id->id, ib_event);
                cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
                                       ib_event->private_data, offset);
  }
  EXPORT_SYMBOL(rdma_listen);
  
+ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+ {
+       struct rdma_id_private *id_priv;
+       id_priv = container_of(id, struct rdma_id_private, id);
+       id_priv->tos = (u8) tos;
+ }
+ EXPORT_SYMBOL(rdma_set_service_type);
  static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
                              void *context)
  {
  static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
                              struct cma_work *work)
  {
-       struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
+       struct rdma_addr *addr = &id_priv->id.route.addr;
        struct ib_sa_path_rec path_rec;
+       ib_sa_comp_mask comp_mask;
+       struct sockaddr_in6 *sin6;
  
        memset(&path_rec, 0, sizeof path_rec);
-       ib_addr_get_sgid(addr, &path_rec.sgid);
-       ib_addr_get_dgid(addr, &path_rec.dgid);
-       path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
+       ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
+       ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
+       path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
        path_rec.numb_path = 1;
        path_rec.reversible = 1;
+       path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr);
+       comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+                   IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+                   IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+       if (addr->src_addr.sa_family == AF_INET) {
+               path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+               comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+       } else {
+               sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+               path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+               comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+       }
  
        id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
-                               id_priv->id.port_num, &path_rec,
-                               IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
-                               IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
-                               IB_SA_PATH_REC_REVERSIBLE,
-                               timeout_ms, GFP_KERNEL,
-                               cma_query_handler, work, &id_priv->query);
+                                              id_priv->id.port_num, &path_rec,
+                                              comp_mask, timeout_ms,
+                                              GFP_KERNEL, cma_query_handler,
+                                              work, &id_priv->query);
  
        return (id_priv->query_id < 0) ? id_priv->query_id : 0;
  }
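The rewritten cma_query_ib_route() now fills the SA path record per address family: for IPv4 the TOS stored by the new rdma_set_service_type() export becomes the path record's qos_class, while for IPv6 the traffic class is extracted from bits 20-27 of sin6_flowinfo. A hedged usage sketch for the new export; the wrapper function and its caller are hypothetical, only rdma_set_service_type() itself comes from the diff above:

#include <rdma/rdma_cm.h>

/* Sketch only: a ULP sets the type of service on its cm_id before
 * resolving the route, so the value lands in the path record above. */
static void example_set_tos(struct rdma_cm_id *id, int tos)
{
        rdma_set_service_type(id, tos);  /* stored as id_priv->tos */
        /* ... rdma_resolve_route() then maps it to qos_class for
         * IPv4; IPv6 callers encode it in sin6_flowinfo instead. */
}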
@@@ -1866,14 -1892,13 +1892,14 @@@ err1
  static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
  {
        struct rdma_bind_list *bind_list;
 -      int port, ret;
 +      int port, ret, low, high;
  
        bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
        if (!bind_list)
                return -ENOMEM;
  
  retry:
 +      /* FIXME: add proper port randomization per like inet_csk_get_port */
        do {
                ret = idr_get_new_above(ps, bind_list, next_port, &port);
        } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
        if (ret)
                goto err1;
  
 -      if (port > sysctl_local_port_range[1]) {
 -              if (next_port != sysctl_local_port_range[0]) {
 +      inet_get_local_port_range(&low, &high);
 +      if (port > high) {
 +              if (next_port != low) {
                        idr_remove(ps, port);
 -                      next_port = sysctl_local_port_range[0];
 +                      next_port = low;
                        goto retry;
                }
                ret = -EADDRNOTAVAIL;
                goto err2;
        }
  
 -      if (port == sysctl_local_port_range[1])
 -              next_port = sysctl_local_port_range[0];
 +      if (port == high)
 +              next_port = low;
        else
                next_port = port + 1;
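With the sysctl_local_port_range array gone, cma_alloc_any_port() reads the ephemeral range through inet_get_local_port_range() and scans it with a single wrap-around: a port above high restarts the search at low once, and only a second overflow yields -EADDRNOTAVAIL. A minimal sketch of that scan with the idr bookkeeping replaced by a hypothetical in_use() predicate:

#include <errno.h>
#include <stdbool.h>

/* Hedged sketch of the wrap-around search over [low, high]. */
static int alloc_any_port(int *next_port, int low, int high,
                          bool (*in_use)(int port))
{
        int span = high - low + 1;
        int port = *next_port;
        int i;

        for (i = 0; i < span; i++) {
                if (port > high)
                        port = low;      /* wrap to the range start */
                if (!in_use(port)) {
                        *next_port = (port == high) ? low : port + 1;
                        return port;
                }
                port++;
        }
        return -EADDRNOTAVAIL;           /* range exhausted */
}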
  
@@@ -2771,12 -2795,12 +2797,12 @@@ static void cma_remove_one(struct ib_de
  
  static int cma_init(void)
  {
 -      int ret;
 +      int ret, low, high;
  
        get_random_bytes(&next_port, sizeof next_port);
 -      next_port = ((unsigned int) next_port %
 -                  (sysctl_local_port_range[1] - sysctl_local_port_range[0])) +
 -                  sysctl_local_port_range[0];
 +      inet_get_local_port_range(&low, &high);
 +      next_port = ((unsigned int) next_port % (high - low)) + low;
 +
        cma_wq = create_singlethread_workqueue("rdma_cm");
        if (!cma_wq)
                return -ENOMEM;
index 34c6128d2a34836902307260ec6d158eea24e30b,a198ce8371db3520d9deadb4c382be562cd5aa19..6545fa798b12664e96be153eaaec305249c2c9d8
@@@ -86,6 -86,7 +86,7 @@@ enum 
        IPOIB_MCAST_STARTED       = 8,
        IPOIB_FLAG_NETIF_STOPPED  = 9,
        IPOIB_FLAG_ADMIN_CM       = 10,
+       IPOIB_FLAG_UMCAST         = 11,
  
        IPOIB_MAX_BACKOFF_SECONDS = 16,
  
@@@ -113,7 -114,27 +114,27 @@@ struct ipoib_pseudoheader 
        u8  hwaddr[INFINIBAND_ALEN];
  };
  
- struct ipoib_mcast;
+ /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+ struct ipoib_mcast {
+       struct ib_sa_mcmember_rec mcmember;
+       struct ib_sa_multicast   *mc;
+       struct ipoib_ah          *ah;
+       struct rb_node    rb_node;
+       struct list_head  list;
+       unsigned long created;
+       unsigned long backoff;
+       unsigned long flags;
+       unsigned char logcount;
+       struct list_head  neigh_list;
+       struct sk_buff_head pkt_queue;
+       struct net_device *dev;
+ };
  
  struct ipoib_rx_buf {
        struct sk_buff *skb;
@@@ -228,8 -249,6 +249,8 @@@ struct ipoib_dev_priv 
  
        struct net_device *dev;
  
 +      struct napi_struct napi;
 +
        unsigned long flags;
  
        struct mutex mcast_mutex;
  
        struct ib_event_handler event_handler;
  
 -      struct net_device_stats stats;
 -
        struct net_device *parent;
        struct list_head child_intfs;
        struct list_head list;
@@@ -351,7 -372,7 +372,7 @@@ extern struct workqueue_struct *ipoib_w
  
  /* functions */
  
 -int ipoib_poll(struct net_device *dev, int *budget);
 +int ipoib_poll(struct napi_struct *napi, int budget);
  void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
  
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@@ -364,6 -385,7 +385,7 @@@ static inline void ipoib_put_ah(struct 
  
  int ipoib_open(struct net_device *dev);
  int ipoib_add_pkey_attr(struct net_device *dev);
+ int ipoib_add_umcast_attr(struct net_device *dev);
  
  void ipoib_send(struct net_device *dev, struct sk_buff *skb,
                struct ipoib_ah *address, u32 qpn);
index 1afd93cdd6bbb759bafd3878cbb962a0798efce3,23addb3a6f4e8d12c2f5d34fce6f0659ca3198b6..0a0dcb8fdfd1ac2ca54ec326974341a6ff187baa
@@@ -430,7 -430,7 +430,7 @@@ void ipoib_cm_handle_rx_wc(struct net_d
                ipoib_dbg(priv, "cm recv error "
                           "(status=%d, wrid=%d vend_err %x)\n",
                           wc->status, wr_id, wc->vendor_err);
 -              ++priv->stats.rx_dropped;
 +              ++dev->stats.rx_dropped;
                goto repost;
        }
  
                 * this packet and reuse the old buffer.
                 */
                ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
 -              ++priv->stats.rx_dropped;
 +              ++dev->stats.rx_dropped;
                goto repost;
        }
  
        skb_pull(skb, IPOIB_ENCAP_LEN);
  
        dev->last_rx = jiffies;
 -      ++priv->stats.rx_packets;
 -      priv->stats.rx_bytes += skb->len;
 +      ++dev->stats.rx_packets;
 +      dev->stats.rx_bytes += skb->len;
  
        skb->dev = dev;
        /* XXX get correct PACKET_ type here */
@@@ -512,8 -512,8 +512,8 @@@ void ipoib_cm_send(struct net_device *d
        if (unlikely(skb->len > tx->mtu)) {
                ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
                           skb->len, tx->mtu);
 -              ++priv->stats.tx_dropped;
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_dropped;
 +              ++dev->stats.tx_errors;
                ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
                return;
        }
        tx_req->skb = skb;
        addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
        if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_errors;
                dev_kfree_skb_any(skb);
                return;
        }
        if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
                                addr, skb->len))) {
                ipoib_warn(priv, "post_send failed\n");
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_errors;
                ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
                dev_kfree_skb_any(skb);
        } else {
@@@ -580,8 -580,8 +580,8 @@@ static void ipoib_cm_handle_tx_wc(struc
        ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
  
        /* FIXME: is this right? Shouldn't we only increment on success? */
 -      ++priv->stats.tx_packets;
 -      priv->stats.tx_bytes += tx_req->skb->len;
 +      ++dev->stats.tx_packets;
 +      dev->stats.tx_bytes += tx_req->skb->len;
  
        dev_kfree_skb_any(tx_req->skb);
  
@@@ -810,14 -810,16 +810,16 @@@ static int ipoib_cm_rep_handler(struct 
  static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
  {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       struct ib_qp_init_attr attr = {};
-       attr.recv_cq = priv->cq;
-       attr.srq = priv->cm.srq;
-       attr.cap.max_send_wr = ipoib_sendq_size;
-       attr.cap.max_send_sge = 1;
-       attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-       attr.qp_type = IB_QPT_RC;
-       attr.send_cq = cq;
+       struct ib_qp_init_attr attr = {
+               .send_cq                = cq,
+               .recv_cq                = priv->cq,
+               .srq                    = priv->cm.srq,
+               .cap.max_send_wr        = ipoib_sendq_size,
+               .cap.max_send_sge       = 1,
+               .sq_sig_type            = IB_SIGNAL_ALL_WR,
+               .qp_type                = IB_QPT_RC,
+         };
        return ib_create_qp(priv->pd, &attr);
  }
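The ipoib_cm_create_tx_qp() rewrite above leans on a C99 guarantee: members not named in a designated initializer are implicitly zeroed, so the initializer list both documents the interesting fields and makes the empty `attr = {}` plus field-by-field assignments redundant. A generic illustration (struct and values invented for the example):

struct qp_attr_like {
        void *send_cq;
        void *recv_cq;
        int   max_send_wr;
        int   qp_type;
};

static struct qp_attr_like make_attr(void *cq)
{
        struct qp_attr_like attr = {
                .send_cq = cq,
                .qp_type = 2,   /* stand-in for IB_QPT_RC */
        };
        /* attr.recv_cq == NULL and attr.max_send_wr == 0 here. */
        return attr;
}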
  
index 0ec28c302fbf154c999541f4f14f477668530b3e,5a70e287f25d5592c9bd7cf6b3d972bf63869708..1a77e79f6b432748accdbb9a813f9a257aa4c1c6
@@@ -208,7 -208,7 +208,7 @@@ static void ipoib_ib_handle_rx_wc(struc
         * this packet and reuse the old buffer.
         */
        if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
 -              ++priv->stats.rx_dropped;
 +              ++dev->stats.rx_dropped;
                goto repost;
        }
  
        skb_pull(skb, IPOIB_ENCAP_LEN);
  
        dev->last_rx = jiffies;
 -      ++priv->stats.rx_packets;
 -      priv->stats.rx_bytes += skb->len;
 +      ++dev->stats.rx_packets;
 +      dev->stats.rx_bytes += skb->len;
  
        skb->dev = dev;
        /* XXX get correct PACKET_ type here */
@@@ -260,8 -260,8 +260,8 @@@ static void ipoib_ib_handle_tx_wc(struc
        ib_dma_unmap_single(priv->ca, tx_req->mapping,
                            tx_req->skb->len, DMA_TO_DEVICE);
  
 -      ++priv->stats.tx_packets;
 -      priv->stats.tx_bytes += tx_req->skb->len;
 +      ++dev->stats.tx_packets;
 +      dev->stats.tx_bytes += tx_req->skb->len;
  
        dev_kfree_skb_any(tx_req->skb);
  
                           wc->status, wr_id, wc->vendor_err);
  }
  
 -int ipoib_poll(struct net_device *dev, int *budget)
 +int ipoib_poll(struct napi_struct *napi, int budget)
  {
 -      struct ipoib_dev_priv *priv = netdev_priv(dev);
 -      int max = min(*budget, dev->quota);
 +      struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
 +      struct net_device *dev = priv->dev;
        int done;
        int t;
 -      int empty;
        int n, i;
  
        done  = 0;
 -      empty = 0;
  
 -      while (max) {
 +poll_more:
 +      while (done < budget) {
 +              int max = (budget - done);
 +
                t = min(IPOIB_NUM_WC, max);
                n = ib_poll_cq(priv->cq, t, priv->ibwc);
  
 -              for (i = 0; i < n; ++i) {
 +              for (i = 0; i < n; i++) {
                        struct ib_wc *wc = priv->ibwc + i;
  
                        if (wc->wr_id & IPOIB_CM_OP_SRQ) {
                                ++done;
 -                              --max;
                                ipoib_cm_handle_rx_wc(dev, wc);
                        } else if (wc->wr_id & IPOIB_OP_RECV) {
                                ++done;
 -                              --max;
                                ipoib_ib_handle_rx_wc(dev, wc);
                        } else
                                ipoib_ib_handle_tx_wc(dev, wc);
                }
  
 -              if (n != t) {
 -                      empty = 1;
 +              if (n != t)
                        break;
 -              }
        }
  
 -      dev->quota -= done;
 -      *budget    -= done;
 -
 -      if (empty) {
 -              netif_rx_complete(dev);
 +      if (done < budget) {
 +              netif_rx_complete(dev, napi);
                if (unlikely(ib_req_notify_cq(priv->cq,
                                              IB_CQ_NEXT_COMP |
                                              IB_CQ_REPORT_MISSED_EVENTS)) &&
 -                  netif_rx_reschedule(dev, 0))
 -                      return 1;
 -
 -              return 0;
 +                  netif_rx_reschedule(dev, napi))
 +                      goto poll_more;
        }
  
 -      return 1;
 +      return done;
  }
  
  void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
  {
 -      netif_rx_schedule(dev_ptr);
 +      struct net_device *dev = dev_ptr;
 +      struct ipoib_dev_priv *priv = netdev_priv(dev);
 +
 +      netif_rx_schedule(dev, &priv->napi);
  }
  
  static inline int post_send(struct ipoib_dev_priv *priv,
@@@ -362,8 -367,8 +362,8 @@@ void ipoib_send(struct net_device *dev
        if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
                ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
                           skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
 -              ++priv->stats.tx_dropped;
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_dropped;
 +              ++dev->stats.tx_errors;
                ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
                return;
        }
        addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
                                 DMA_TO_DEVICE);
        if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_errors;
                dev_kfree_skb_any(skb);
                return;
        }
        if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
                               address->ah, qpn, addr, skb->len))) {
                ipoib_warn(priv, "post_send failed\n");
 -              ++priv->stats.tx_errors;
 +              ++dev->stats.tx_errors;
                ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
                dev_kfree_skb_any(skb);
        } else {
@@@ -553,6 -558,14 +553,14 @@@ void ipoib_drain_cq(struct net_device *
        do {
                n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
                for (i = 0; i < n; ++i) {
+                       /*
+                        * Convert any successful completions to flush
+                        * errors to avoid passing packets up the
+                        * stack after bringing the device down.
+                        */
+                       if (priv->ibwc[i].status == IB_WC_SUCCESS)
+                               priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
                        if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
                                ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
                        else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
@@@ -572,6 -585,7 +580,6 @@@ int ipoib_ib_dev_stop(struct net_devic
        int i;
  
        clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 -      netif_poll_disable(dev);
  
        ipoib_cm_dev_stop(dev);
  
@@@ -654,6 -668,7 +662,6 @@@ timeout
                msleep(1);
        }
  
 -      netif_poll_enable(dev);
        ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP);
  
        return 0;
index 855c9deca8b716569d7014a0117ad220d00b6c73,ff17fe3c765bd720b7601e6e6a9a1e5f8d4949ce..e072f3c32ce6f307aa3bdaf7935557b4bc5a53d0
@@@ -98,20 -98,16 +98,20 @@@ int ipoib_open(struct net_device *dev
  
        ipoib_dbg(priv, "bringing up interface\n");
  
 +      napi_enable(&priv->napi);
        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
  
        if (ipoib_pkey_dev_delay_open(dev))
                return 0;
  
 -      if (ipoib_ib_dev_open(dev))
 +      if (ipoib_ib_dev_open(dev)) {
 +              napi_disable(&priv->napi);
                return -EINVAL;
 +      }
  
        if (ipoib_ib_dev_up(dev)) {
                ipoib_ib_dev_stop(dev, 1);
 +              napi_disable(&priv->napi);
                return -EINVAL;
        }
  
@@@ -144,7 -140,6 +144,7 @@@ static int ipoib_stop(struct net_devic
        ipoib_dbg(priv, "stopping interface\n");
  
        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 +      napi_disable(&priv->napi);
  
        netif_stop_queue(dev);
  
@@@ -473,9 -468,10 +473,10 @@@ static struct ipoib_path *path_rec_crea
        INIT_LIST_HEAD(&path->neigh_list);
  
        memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
-       path->pathrec.sgid      = priv->local_gid;
-       path->pathrec.pkey      = cpu_to_be16(priv->pkey);
-       path->pathrec.numb_path = 1;
+       path->pathrec.sgid          = priv->local_gid;
+       path->pathrec.pkey          = cpu_to_be16(priv->pkey);
+       path->pathrec.numb_path     = 1;
+       path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
  
        return path;
  }
@@@ -496,6 -492,7 +497,7 @@@ static int path_rec_start(struct net_de
                                   IB_SA_PATH_REC_DGID          |
                                   IB_SA_PATH_REC_SGID          |
                                   IB_SA_PATH_REC_NUMB_PATH     |
+                                  IB_SA_PATH_REC_TRAFFIC_CLASS |
                                   IB_SA_PATH_REC_PKEY,
                                   1000, GFP_ATOMIC,
                                   path_rec_completion,
@@@ -517,7 -514,7 +519,7 @@@ static void neigh_add_path(struct sk_bu
  
        neigh = ipoib_neigh_alloc(skb->dst->neighbour);
        if (!neigh) {
 -              ++priv->stats.tx_dropped;
 +              ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                return;
        }
@@@ -582,7 -579,7 +584,7 @@@ err_list
  err_path:
        ipoib_neigh_free(dev, neigh);
  err_drop:
 -      ++priv->stats.tx_dropped;
 +      ++dev->stats.tx_dropped;
        dev_kfree_skb_any(skb);
  
        spin_unlock(&priv->lock);
@@@ -631,7 -628,7 +633,7 @@@ static void unicast_arp_send(struct sk_
                        } else
                                __path_add(dev, path);
                } else {
 -                      ++priv->stats.tx_dropped;
 +                      ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                }
  
                skb_push(skb, sizeof *phdr);
                __skb_queue_tail(&path->queue, skb);
        } else {
 -              ++priv->stats.tx_dropped;
 +              ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }
  
@@@ -718,7 -715,7 +720,7 @@@ static int ipoib_start_xmit(struct sk_b
                        __skb_queue_tail(&neigh->queue, skb);
                        spin_unlock(&priv->lock);
                } else {
 -                      ++priv->stats.tx_dropped;
 +                      ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                }
        } else {
                                           IPOIB_QPN(phdr->hwaddr),
                                           IPOIB_GID_RAW_ARG(phdr->hwaddr + 4));
                                dev_kfree_skb_any(skb);
 -                              ++priv->stats.tx_dropped;
 +                              ++dev->stats.tx_dropped;
                                goto out;
                        }
  
@@@ -758,6 -755,13 +760,6 @@@ out
        return NETDEV_TX_OK;
  }
  
 -static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
 -{
 -      struct ipoib_dev_priv *priv = netdev_priv(dev);
 -
 -      return &priv->stats;
 -}
 -
  static void ipoib_timeout(struct net_device *dev)
  {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
  static int ipoib_hard_header(struct sk_buff *skb,
                             struct net_device *dev,
                             unsigned short type,
 -                           void *daddr, void *saddr, unsigned len)
 +                           const void *daddr, const void *saddr, unsigned len)
  {
        struct ipoib_header *header;
  
@@@ -854,10 -858,11 +856,10 @@@ struct ipoib_neigh *ipoib_neigh_alloc(s
  
  void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
  {
 -      struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct sk_buff *skb;
        *to_ipoib_neigh(neigh->neighbour) = NULL;
        while ((skb = __skb_dequeue(&neigh->queue))) {
 -              ++priv->stats.tx_dropped;
 +              ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }
        if (ipoib_cm_get(neigh))
@@@ -932,10 -937,6 +934,10 @@@ void ipoib_dev_cleanup(struct net_devic
        priv->tx_ring = NULL;
  }
  
 +static const struct header_ops ipoib_header_ops = {
 +      .create = ipoib_hard_header,
 +};
 +
  static void ipoib_setup(struct net_device *dev)
  {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        dev->stop                = ipoib_stop;
        dev->change_mtu          = ipoib_change_mtu;
        dev->hard_start_xmit     = ipoib_start_xmit;
 -      dev->get_stats           = ipoib_get_stats;
        dev->tx_timeout          = ipoib_timeout;
 -      dev->hard_header         = ipoib_hard_header;
 +      dev->header_ops          = &ipoib_header_ops;
        dev->set_multicast_list  = ipoib_set_mcast_list;
        dev->neigh_setup         = ipoib_neigh_setup_dev;
 -      dev->poll                = ipoib_poll;
 -      dev->weight              = 100;
 +
 +      netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
  
        dev->watchdog_timeo      = HZ;
  
  
        netif_carrier_off(dev);
  
 -      SET_MODULE_OWNER(dev);
 -
        priv->dev = dev;
  
        spin_lock_init(&priv->lock);
@@@ -1015,6 -1019,37 +1017,37 @@@ static ssize_t show_pkey(struct device 
  }
  static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
  
+ static ssize_t show_umcast(struct device *dev,
+                          struct device_attribute *attr, char *buf)
+ {
+       struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+       return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
+ }
+ static ssize_t set_umcast(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf, size_t count)
+ {
+       struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+       unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
+       if (umcast_val > 0) {
+               set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+               ipoib_warn(priv, "ignoring multicast groups joined directly "
+                               "by userspace\n");
+       } else
+               clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+       return count;
+ }
+ static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
+ int ipoib_add_umcast_attr(struct net_device *dev)
+ {
+       return device_create_file(&dev->dev, &dev_attr_umcast);
+ }
  static ssize_t create_child(struct device *dev,
                            struct device_attribute *attr,
                            const char *buf, size_t count)
@@@ -1081,7 -1116,7 +1114,7 @@@ static struct net_device *ipoib_add_por
        if (result) {
                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
                       hca->name, port, result);
-               goto alloc_mem_failed;
+               goto device_init_failed;
        }
  
        /*
        if (result) {
                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
                       hca->name, port, result);
-               goto alloc_mem_failed;
+               goto device_init_failed;
        } else
                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
  
                goto sysfs_failed;
        if (ipoib_add_pkey_attr(priv->dev))
                goto sysfs_failed;
+       if (ipoib_add_umcast_attr(priv->dev))
+               goto sysfs_failed;
        if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
                goto sysfs_failed;
        if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
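The new per-interface attribute registered via ipoib_add_umcast_attr() appears alongside "pkey" in the netdevice's sysfs directory; writing a non-zero value sets IPOIB_FLAG_UMCAST, and the multicast restart task in ipoib_multicast.c below then skips groups that userspace joined directly. A hedged userspace sketch (the interface name "ib0", and hence the path, is an assumption):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/class/net/ib0/umcast", "w");

        if (!f) {
                perror("umcast");
                return 1;
        }
        fputs("1\n", f);    /* any value > 0 sets IPOIB_FLAG_UMCAST */
        return fclose(f) ? 1 : 0;
}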
index 98e904a7f3e8f3226f6119a98eb8a22cdf7bd77f,62abfb6f35c1f9270ced7bb2920287c59904c6fa..827820ec66d1f3db03cb3a23698ef0244176d9e2
@@@ -57,28 -57,6 +57,6 @@@ MODULE_PARM_DESC(mcast_debug_level
  
  static DEFINE_MUTEX(mcast_mutex);
  
- /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
- struct ipoib_mcast {
-       struct ib_sa_mcmember_rec mcmember;
-       struct ib_sa_multicast   *mc;
-       struct ipoib_ah          *ah;
-       struct rb_node    rb_node;
-       struct list_head  list;
-       unsigned long created;
-       unsigned long backoff;
-       unsigned long flags;
-       unsigned char logcount;
-       struct list_head  neigh_list;
-       struct sk_buff_head pkt_queue;
-       struct net_device *dev;
- };
  struct ipoib_mcast_iter {
        struct net_device *dev;
        union ib_gid       mgid;
@@@ -125,7 -103,7 +103,7 @@@ static void ipoib_mcast_free(struct ipo
        }
  
        spin_lock_irqsave(&priv->tx_lock, flags);
 -      priv->stats.tx_dropped += tx_dropped;
 +      dev->stats.tx_dropped += tx_dropped;
        spin_unlock_irqrestore(&priv->tx_lock, flags);
  
        kfree(mcast);
@@@ -320,7 -298,7 +298,7 @@@ ipoib_mcast_sendonly_join_complete(int 
                /* Flush out any queued packets */
                spin_lock_irq(&priv->tx_lock);
                while (!skb_queue_empty(&mcast->pkt_queue)) {
 -                      ++priv->stats.tx_dropped;
 +                      ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
                }
                spin_unlock_irq(&priv->tx_lock);
@@@ -675,7 -653,7 +653,7 @@@ void ipoib_mcast_send(struct net_devic
        if (!test_bit(IPOIB_MCAST_STARTED, &priv->flags)        ||
            !priv->broadcast                                    ||
            !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
 -              ++priv->stats.tx_dropped;
 +              ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                goto unlock;
        }
                if (!mcast) {
                        ipoib_warn(priv, "unable to allocate memory for "
                                   "multicast structure\n");
 -                      ++priv->stats.tx_dropped;
 +                      ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                        goto out;
                }
                if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
                        skb_queue_tail(&mcast->pkt_queue, skb);
                else {
 -                      ++priv->stats.tx_dropped;
 +                      ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                }
  
@@@ -783,6 -761,7 +761,7 @@@ void ipoib_mcast_restart_task(struct wo
        struct ipoib_mcast *mcast, *tmcast;
        LIST_HEAD(remove_list);
        unsigned long flags;
+       struct ib_sa_mcmember_rec rec;
  
        ipoib_dbg_mcast(priv, "restarting multicast task\n");
  
                if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
                        struct ipoib_mcast *nmcast;
  
+                       /* ignore group which is directly joined by userspace */
+                       if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
+                           !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
+                               ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid "
+                                               IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));
+                               continue;
+                       }
                        /* Not found or send-only group, let's add a new entry */
                        ipoib_dbg_mcast(priv, "adding multicast entry for mgid "
                                        IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));