From 37289efe3ee0c0a00b5d8302df9a2b007e65c187 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 30 Mar 2006 15:52:54 +0200 Subject: IB/mad: fix oops in cancel_mads We have seen the following OOPs in cancel_mads, when restarting opensm multiple times: Call Trace: [] show_stack+0x9b/0xb0 [] show_registers+0x11c/0x190 [] die+0xed/0x160 [] do_page_fault+0x3f6/0x5d0 [] error_code+0x4f/0x60 [] cancel_mads+0x128/0x150 [ib_mad] [] unregister_mad_agent+0x11/0x130 [ib_mad] [] ib_unregister_mad_agent+0x12/0x20 [ib_mad] [] ib_umad_close+0xf3/0x130 [ib_umad] [] __fput+0x187/0x1c0 [] fput+0x19/0x20 [] filp_close+0x3a/0x60 [] put_files_struct+0x68/0xa0 [] do_signal+0x47/0x100 [] do_notify_resume+0x3d/0x40 [] work_notifysig+0x13/0x25 We traced this back to local_completions unlocking mad_agent_priv->lock while still keeping a pointer into local_list. A later call to list_del(&local->completion_list) would then corrupt the list. To fix this, remove the entry from local_list after looking it up but before releasing mad_agent_priv->lock, to prevent cancel_mads from finding and freeing it. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ba54c856b0e5b..3a702da83e41b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2311,6 +2311,7 @@ static void local_completions(void *data) local = list_entry(mad_agent_priv->local_list.next, struct ib_mad_local_private, completion_list); + list_del(&local->completion_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; @@ -2362,7 +2363,6 @@ local_send_completion: &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&local->completion_list); atomic_dec(&mad_agent_priv->refcount); if (!recv) kmem_cache_free(ib_mad_cache, local->mad_priv); -- cgit 1.2.3-korg From f5545d24b8aa9fccd8071203e83bc9f4b26e17a6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 2 Apr 2006 14:39:19 -0700 Subject: IPoIB: Always build debugging code unless CONFIG_EMBEDDED=y Don't allow CONFIG_INFINIBAND_IPOIB_DEBUG to be disabled unless CONFIG_EMBEDDED is selected. We want users (and especially distros) to have this turned on unless they really need to save space, because by the time we want debugging output, it's usually too late to rebuild a kernel. The debugging output can be controlled at runtime via the debug_level module parameter in sysfs. Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 8d2e04cac68e0..13d6d01c72c02 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -10,8 +10,9 @@ config INFINIBAND_IPOIB group: . config INFINIBAND_IPOIB_DEBUG - bool "IP-over-InfiniBand debugging" + bool "IP-over-InfiniBand debugging" if EMBEDDED depends on INFINIBAND_IPOIB + default y ---help--- This option causes debugging code to be compiled into the IPoIB driver. The output can be turned on via the -- cgit 1.2.3-korg From 227c939b00cf786b5e2e95fc904518206f478421 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 2 Apr 2006 14:39:20 -0700 Subject: IB/mthca: Always build debugging code unless CONFIG_EMBEDDED=y Change the mthca debugging trace output code so that it can enabled and disabled at runtime with the debug_level module parameter in sysfs. Also, don't allow CONFIG_INFINIBAND_MTHCA_DEBUG to be disabled unless CONFIG_EMBEDDED is selected. We want users (and especially distros) to have this turned on unless they really need to save space, because by the time we want debugging output, it's usually too late to rebuild a kernel. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/Kconfig | 11 ++++++----- drivers/infiniband/hw/mthca/Makefile | 4 ---- drivers/infiniband/hw/mthca/mthca_dev.h | 17 +++++++++++++++-- drivers/infiniband/hw/mthca/mthca_main.c | 8 ++++++++ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig index e88be85b3d5ca..9aa5a4468a753 100644 --- a/drivers/infiniband/hw/mthca/Kconfig +++ b/drivers/infiniband/hw/mthca/Kconfig @@ -7,10 +7,11 @@ config INFINIBAND_MTHCA ("Tavor") and the MT25208 PCI Express HCA ("Arbel"). config INFINIBAND_MTHCA_DEBUG - bool "Verbose debugging output" + bool "Verbose debugging output" if EMBEDDED depends on INFINIBAND_MTHCA - default n + default y ---help--- - This option causes the mthca driver produce a bunch of debug - messages. Select this is you are developing the driver or - trying to diagnose a problem. + This option causes debugging code to be compiled into the + mthca driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile index 47ec5a7cba0b6..e388d95d0cf1e 100644 --- a/drivers/infiniband/hw/mthca/Makefile +++ b/drivers/infiniband/hw/mthca/Makefile @@ -1,7 +1,3 @@ -ifdef CONFIG_INFINIBAND_MTHCA_DEBUG -EXTRA_CFLAGS += -DDEBUG -endif - obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index ad52edbefe98b..bb2a9d628d7d8 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -355,8 +355,21 @@ struct mthca_dev { spinlock_t sm_lock; }; -#define mthca_dbg(mdev, format, arg...) \ - dev_dbg(&mdev->pdev->dev, format, ## arg) +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG +extern int mthca_debug_level; + +#define mthca_dbg(mdev, format, arg...) \ + do { \ + if (mthca_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0) + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + #define mthca_err(mdev, format, arg...) \ dev_err(&mdev->pdev->dev, format, ## arg) #define mthca_info(mdev, format, arg...) \ diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 266f347c67076..597d7dc7088e7 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -52,6 +52,14 @@ MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG + +int mthca_debug_level = 0; +module_param_named(debug_level, mthca_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + #ifdef CONFIG_PCI_MSI static int msi_x = 0; -- cgit 1.2.3-korg From ce1823f0323be9f38bbe0df229a5bba025404923 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 3 Apr 2006 09:31:04 -0700 Subject: IB/srp: Fix memory leak in options parsing Fix memory leak if parsing destination GID fails. Coverity bug 1042 Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index fd8a95a9c5d3e..5f2b3f6e4c477 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1434,6 +1434,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) p = match_strdup(args); if (strlen(p) != 32) { printk(KERN_WARNING PFX "bad dest GID parameter '%s'\n", p); + kfree(p); goto out; } -- cgit 1.2.3-korg From d2e0655ede1d91c3a586455d03a4a2d57e659830 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 4 Apr 2006 19:59:40 +0300 Subject: IPoIB: Consolidate private neighbour data handling Consolidate IPoIB's private neighbour data handling into ipoib_neigh_alloc() and ipoib_neigh_free(). This will make it easier to keep track of the neighbour structures that IPoIB is handling, and is a nice cleanup of the code: add/remove: 2/1 grow/shrink: 1/8 up/down: 100/-178 (-78) function old new delta ipoib_neigh_alloc - 61 +61 ipoib_neigh_free - 36 +36 ipoib_mcast_join_finish 1288 1291 +3 path_rec_completion 575 573 -2 ipoib_mcast_join_task 664 660 -4 ipoib_neigh_destructor 101 92 -9 ipoib_neigh_setup_dev 14 3 -11 ipoib_neigh_setup 17 - -17 path_free 238 215 -23 ipoib_mcast_free 329 306 -23 ipoib_mcast_send 718 684 -34 neigh_add_path 705 650 -55 Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 3 ++ drivers/infiniband/ulp/ipoib/ipoib_main.c | 41 ++++++++++++++++++-------- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 7 ++--- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index b640107fb732c..374109df73035 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -230,6 +230,9 @@ static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) INFINIBAND_ALEN, sizeof(void *)); } +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh); +void ipoib_neigh_free(struct ipoib_neigh *neigh); + extern struct workqueue_struct *ipoib_workqueue; /* functions */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9b0bd7c746ca1..8f6607bf42615 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -252,8 +252,8 @@ static void path_free(struct net_device *dev, struct ipoib_path *path) */ if (neigh->ah) ipoib_put_ah(neigh->ah); - *to_ipoib_neigh(neigh->neighbour) = NULL; - kfree(neigh); + + ipoib_neigh_free(neigh); } spin_unlock_irqrestore(&priv->lock, flags); @@ -481,7 +481,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) struct ipoib_path *path; struct ipoib_neigh *neigh; - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + neigh = ipoib_neigh_alloc(skb->dst->neighbour); if (!neigh) { ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -489,8 +489,6 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) } skb_queue_head_init(&neigh->queue); - neigh->neighbour = skb->dst->neighbour; - *to_ipoib_neigh(skb->dst->neighbour) = neigh; /* * We can only be called from ipoib_start_xmit, so we're @@ -503,7 +501,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) path = path_rec_create(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4)); if (!path) - goto err; + goto err_path; __path_add(dev, path); } @@ -521,17 +519,17 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) __skb_queue_tail(&neigh->queue, skb); if (!path->query && path_rec_start(dev, path)) - goto err; + goto err_list; } spin_unlock(&priv->lock); return; -err: - *to_ipoib_neigh(skb->dst->neighbour) = NULL; +err_list: list_del(&neigh->list); - kfree(neigh); +err_path: + ipoib_neigh_free(neigh); ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -763,8 +761,7 @@ static void ipoib_neigh_destructor(struct neighbour *n) if (neigh->ah) ah = neigh->ah; list_del(&neigh->list); - *to_ipoib_neigh(n) = NULL; - kfree(neigh); + ipoib_neigh_free(neigh); } spin_unlock_irqrestore(&priv->lock, flags); @@ -773,6 +770,26 @@ static void ipoib_neigh_destructor(struct neighbour *n) ipoib_put_ah(ah); } +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) +{ + struct ipoib_neigh *neigh; + + neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->neighbour = neighbour; + *to_ipoib_neigh(neighbour) = neigh; + + return neigh; +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + *to_ipoib_neigh(neigh->neighbour) = NULL; + kfree(neigh); +} + static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) { parms->neigh_destructor = ipoib_neigh_destructor; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 93c462eaf4fd7..a8395ef06c172 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -114,8 +114,7 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) */ if (neigh->ah) ipoib_put_ah(neigh->ah); - *to_ipoib_neigh(neigh->neighbour) = NULL; - kfree(neigh); + ipoib_neigh_free(neigh); } spin_unlock_irqrestore(&priv->lock, flags); @@ -772,13 +771,11 @@ out: if (skb->dst && skb->dst->neighbour && !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour); if (neigh) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; - neigh->neighbour = skb->dst->neighbour; - *to_ipoib_neigh(skb->dst->neighbour) = neigh; list_add_tail(&neigh->list, &mcast->neigh_list); } } -- cgit 1.2.3-korg From bf6a9e31cfa768ce0a8e18474b3ca808641d9243 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 10 Apr 2006 09:43:47 -0700 Subject: IB: simplify static rate encoding Push translation of static rate to HCA format into low-level drivers, where it belongs. For static rate encoding, use encoding of rate field from IB standard PathRecord, with addition of value 0, for backwards compatibility with current usage. The changes are: - Add enum ib_rate to midlayer includes. - Get rid of static rate translation in IPoIB; just use static rate directly from Path and MulticastGroup records. - Update mthca driver to translate absolute static rate into the format used by hardware. This also fixes mthca's static rate handling for HCAs that are capable of 4X DDR. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 34 +++++++++ drivers/infiniband/hw/mthca/mthca_av.c | 100 ++++++++++++++++++++++++- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 + drivers/infiniband/hw/mthca/mthca_cmd.h | 1 + drivers/infiniband/hw/mthca/mthca_dev.h | 4 + drivers/infiniband/hw/mthca/mthca_mad.c | 42 ++++++++++- drivers/infiniband/hw/mthca/mthca_main.c | 12 +++ drivers/infiniband/hw/mthca/mthca_provider.h | 3 +- drivers/infiniband/hw/mthca/mthca_qp.c | 46 ++++++++---- drivers/infiniband/ulp/ipoib/ipoib_fs.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 11 +-- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 10 +-- include/rdma/ib_sa.h | 28 ------- include/rdma/ib_verbs.h | 28 +++++++ 14 files changed, 261 insertions(+), 64 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index cae0845f472ac..b78e7dc693307 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -45,6 +45,40 @@ #include #include +int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + /* Protection domains */ struct ib_pd *ib_alloc_pd(struct ib_device *device) diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index bc5bdcbe51b56..b12aa03be2511 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -42,6 +42,20 @@ #include "mthca_dev.h" +enum { + MTHCA_RATE_TAVOR_FULL = 0, + MTHCA_RATE_TAVOR_1X = 1, + MTHCA_RATE_TAVOR_4X = 2, + MTHCA_RATE_TAVOR_1X_DDR = 3 +}; + +enum { + MTHCA_RATE_MEMFREE_FULL = 0, + MTHCA_RATE_MEMFREE_QUARTER = 1, + MTHCA_RATE_MEMFREE_EIGHTH = 2, + MTHCA_RATE_MEMFREE_HALF = 3 +}; + struct mthca_av { __be32 port_pd; u8 reserved1; @@ -55,6 +69,90 @@ struct mthca_av { __be32 dgid[4]; }; +static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_MEMFREE_EIGHTH: + return mult_to_ib_rate(port_rate >> 3); + case MTHCA_RATE_MEMFREE_QUARTER: + return mult_to_ib_rate(port_rate >> 2); + case MTHCA_RATE_MEMFREE_HALF: + return mult_to_ib_rate(port_rate >> 1); + case MTHCA_RATE_MEMFREE_FULL: + default: + return mult_to_ib_rate(port_rate); + } +} + +static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS; + case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS; + case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS; + default: return port_rate; + } +} + +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port) +{ + if (mthca_is_memfree(dev)) { + /* Handle old Arbel FW */ + if (dev->limits.stat_rate_support == 0x3 && mthca_rate) + return IB_RATE_2_5_GBPS; + + return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]); + } else + return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]); +} + +static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate) +{ + if (cur_rate <= req_rate) + return 0; + + /* + * Inter-packet delay (IPD) to get from rate X down to a rate + * no more than Y is (X - 1) / Y. + */ + switch ((cur_rate - 1) / req_rate) { + case 0: return MTHCA_RATE_MEMFREE_FULL; + case 1: return MTHCA_RATE_MEMFREE_HALF; + case 2: /* fall through */ + case 3: return MTHCA_RATE_MEMFREE_QUARTER; + default: return MTHCA_RATE_MEMFREE_EIGHTH; + } +} + +static u8 ib_rate_to_tavor(u8 static_rate) +{ + switch (static_rate) { + case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X; + case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR; + case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X; + default: return MTHCA_RATE_TAVOR_FULL; + } +} + +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port) +{ + u8 rate; + + if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1]) + return 0; + + if (mthca_is_memfree(dev)) + rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate), + dev->rate[port - 1]); + else + rate = ib_rate_to_tavor(static_rate); + + if (!(dev->limits.stat_rate_support & (1 << rate))) + rate = 1; + + return rate; +} + int mthca_create_ah(struct mthca_dev *dev, struct mthca_pd *pd, struct ib_ah_attr *ah_attr, @@ -107,7 +205,7 @@ on_hca_fail: av->g_slid = ah_attr->src_path_bits; av->dlid = cpu_to_be16(ah_attr->dlid); av->msg_sr = (3 << 4) | /* 2K message */ - ah_attr->static_rate; + mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num); av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); if (ah_attr->ah_flags & IB_AH_GRH) { av->g_slid |= 0x80; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 343eca507870c..1985b5dfa481b 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -965,6 +965,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, u32 *outbox; u8 field; u16 size; + u16 stat_rate; int err; #define QUERY_DEV_LIM_OUT_SIZE 0x100 @@ -995,6 +996,7 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, #define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 #define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 #define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c #define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 #define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 @@ -1086,6 +1088,8 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, dev_lim->num_ports = field & 0xf; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); dev_lim->max_gids = 1 << (field & 0xf); + MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); + dev_lim->stat_rate_support = stat_rate; MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); dev_lim->max_pkeys = 1 << (field & 0xf); MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h index e4ec35c40dd3c..2f976f2051d6a 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.h +++ b/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -146,6 +146,7 @@ struct mthca_dev_lim { int max_vl; int num_ports; int max_gids; + u16 stat_rate_support; int max_pkeys; u32 flags; int reserved_uars; diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index bb2a9d628d7d8..49d0eae5518a2 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -172,6 +172,7 @@ struct mthca_limits { int reserved_pds; u32 page_size_cap; u32 flags; + u16 stat_rate_support; u8 port_width_cap; }; @@ -353,6 +354,7 @@ struct mthca_dev { struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2]; struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; spinlock_t sm_lock; + u8 rate[MTHCA_MAX_PORTS]; }; #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG @@ -555,6 +557,8 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, struct ib_ud_header *header); int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr); int mthca_ah_grh_present(struct mthca_ah *ah); +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index dfb482eac9a25..f235c7ea42f07 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -49,6 +49,30 @@ enum { MTHCA_VENDOR_CLASS2 = 0xa }; +int mthca_update_rate(struct mthca_dev *dev, u8 port_num) +{ + struct ib_port_attr *tprops = NULL; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(&dev->ib_dev, port_num, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", + ret, dev->ib_dev.name, port_num); + goto out; + } + + dev->rate[port_num - 1] = tprops->active_speed * + ib_width_enum_to_int(tprops->active_width); + +out: + kfree(tprops); + return ret; +} + static void update_sm_ah(struct mthca_dev *dev, u8 port_num, u16 lid, u8 sl) { @@ -90,6 +114,7 @@ static void smp_snoop(struct ib_device *ibdev, mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_SET) { if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + mthca_update_rate(to_mdev(ibdev), port_num); update_sm_ah(to_mdev(ibdev), port_num, be16_to_cpup((__be16 *) (mad->data + 58)), (*(u8 *) (mad->data + 76)) & 0xf); @@ -246,6 +271,7 @@ int mthca_create_agents(struct mthca_dev *dev) { struct ib_mad_agent *agent; int p, q; + int ret; spin_lock_init(&dev->sm_lock); @@ -255,11 +281,23 @@ int mthca_create_agents(struct mthca_dev *dev) q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, NULL, NULL); - if (IS_ERR(agent)) + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); goto err; + } dev->send_agent[p][q] = agent; } + + for (p = 1; p <= dev->limits.num_ports; ++p) { + ret = mthca_update_rate(dev, p); + if (ret) { + mthca_err(dev, "Failed to obtain port %d rate." + " aborting.\n", p); + goto err; + } + } + return 0; err: @@ -268,7 +306,7 @@ err: if (dev->send_agent[p][q]) ib_unregister_mad_agent(dev->send_agent[p][q]); - return PTR_ERR(agent); + return ret; } void __devexit mthca_free_agents(struct mthca_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 597d7dc7088e7..0dc5b8da0007d 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -199,6 +199,18 @@ static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim mdev->limits.port_width_cap = dev_lim->max_port_width; mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); mdev->limits.flags = dev_lim->flags; + /* + * For old FW that doesn't return static rate support, use a + * value of 0x3 (only static rate values of 0 or 1 are handled), + * except on Sinai, where even old FW can handle static rate + * values of 2 and 3. + */ + if (dev_lim->stat_rate_support) + mdev->limits.stat_rate_support = dev_lim->stat_rate_support; + else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mdev->limits.stat_rate_support = 0xf; + else + mdev->limits.stat_rate_support = 0x3; /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. May be doable since hardware supports it for SRQ. diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 2e7f521369651..6676a786d6909 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -257,6 +257,8 @@ struct mthca_qp { atomic_t refcount; u32 qpn; int is_direct; + u8 port; /* for SQP and memfree use only */ + u8 alt_port; /* for memfree use only */ u8 transport; u8 state; u8 atomic_rd_en; @@ -278,7 +280,6 @@ struct mthca_qp { struct mthca_sqp { struct mthca_qp qp; - int port; int pkey_index; u32 qkey; u32 send_psn; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 057c8e6af87b3..f37b0e3673230 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -248,6 +248,9 @@ void mthca_qp_event(struct mthca_dev *dev, u32 qpn, return; } + if (event_type == IB_EVENT_PATH_MIG) + qp->port = qp->alt_port; + event.device = &dev->ib_dev; event.event = event_type; event.element.qp = &qp->ibqp; @@ -392,10 +395,16 @@ static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, { memset(ib_ah_attr, 0, sizeof *path); ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) + return; + ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28; ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f; - ib_ah_attr->static_rate = path->static_rate & 0x7; + ib_ah_attr->static_rate = mthca_rate_to_ib(dev, + path->static_rate & 0x7, + ib_ah_attr->port_num); ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1); @@ -455,8 +464,10 @@ int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_m qp_attr->cap.max_recv_sge = qp->rq.max_gs; qp_attr->cap.max_inline_data = qp->max_inline_data; - to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); - to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + if (qp->transport == RC || qp->transport == UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + } qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; qp_attr->alt_pkey_index = be32_to_cpu(context->alt_path.port_pkey) & 0x7f; @@ -484,11 +495,11 @@ out: } static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah, - struct mthca_qp_path *path) + struct mthca_qp_path *path, u8 port) { path->g_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); - path->static_rate = !!ah->static_rate; + path->static_rate = mthca_get_rate(dev, ah->static_rate, port); if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->limits.gid_table_len) { @@ -634,7 +645,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (qp->transport == MLX) qp_context->pri_path.port_pkey |= - cpu_to_be32(to_msqp(qp)->port << 24); + cpu_to_be32(qp->port << 24); else { if (attr_mask & IB_QP_PORT) { qp_context->pri_path.port_pkey |= @@ -657,7 +668,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } if (attr_mask & IB_QP_AV) { - if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path)) + if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) return -EINVAL; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); @@ -681,7 +693,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) return -EINVAL; } - if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path)) + if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, + attr->alt_ah_attr.port_num)) return -EINVAL; qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | @@ -791,6 +804,10 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; if (is_sqp(dev, qp)) store_attrs(to_msqp(qp), attr, attr_mask); @@ -802,13 +819,13 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (is_qp0(dev, qp)) { if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) - init_port(dev, to_msqp(qp)->port); + init_port(dev, qp->port); if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) - mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status); + mthca_CLOSE_IB(dev, qp->port, &status); } /* @@ -1212,6 +1229,9 @@ int mthca_alloc_qp(struct mthca_dev *dev, if (qp->qpn == -1) return -ENOMEM; + /* initialize port to zero for error-catching. */ + qp->port = 0; + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, send_policy, qp); if (err) { @@ -1261,7 +1281,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev, if (err) goto err_out; - sqp->port = port; + sqp->qp.port = port; sqp->qp.qpn = mqpn; sqp->qp.transport = MLX; @@ -1404,10 +1424,10 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); if (!sqp->qp.ibqp.qp_num) - ib_get_cached_pkey(&dev->ib_dev, sqp->port, + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); else - ib_get_cached_pkey(&dev->ib_dev, sqp->port, + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); sqp->ud_header.bth.pkey = cpu_to_be16(pkey); sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 685258e34034c..5dde380e8dbe9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -213,7 +213,7 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) gid_buf, path.pathrec.dlid ? "yes" : "no"); if (path.pathrec.dlid) { - rate = ib_sa_rate_enum_to_int(path.pathrec.rate) * 25; + rate = ib_rate_to_mult(path.pathrec.rate) * 25; seq_printf(file, " DLID: 0x%04x\n" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8f6607bf42615..9cb9e430aaafa 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -373,16 +373,9 @@ static void path_rec_completion(int status, struct ib_ah_attr av = { .dlid = be16_to_cpu(pathrec->dlid), .sl = pathrec->sl, - .port_num = priv->port + .port_num = priv->port, + .static_rate = pathrec->rate }; - int path_rate = ib_sa_rate_enum_to_int(pathrec->rate); - - if (path_rate > 0 && priv->local_rate > path_rate) - av.static_rate = (priv->local_rate - 1) / path_rate; - - ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n", - av.static_rate, priv->local_rate, - ib_sa_rate_enum_to_int(pathrec->rate)); ah = ipoib_create_ah(dev, priv->pd, &av); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a8395ef06c172..07b9826b51931 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -250,6 +250,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, .port_num = priv->port, .sl = mcast->mcmember.sl, .ah_flags = IB_AH_GRH, + .static_rate = mcast->mcmember.rate, .grh = { .flow_label = be32_to_cpu(mcast->mcmember.flow_label), .hop_limit = mcast->mcmember.hop_limit, @@ -257,17 +258,8 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, .traffic_class = mcast->mcmember.traffic_class } }; - int path_rate = ib_sa_rate_enum_to_int(mcast->mcmember.rate); - av.grh.dgid = mcast->mcmember.mgid; - if (path_rate > 0 && priv->local_rate > path_rate) - av.static_rate = (priv->local_rate - 1) / path_rate; - - ipoib_dbg_mcast(priv, "static_rate %d for local port %dX, mcmember %dX\n", - av.static_rate, priv->local_rate, - ib_sa_rate_enum_to_int(mcast->mcmember.rate)); - ah = ipoib_create_ah(dev, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index f404fe21cc216..ad63c215efe5b 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -91,34 +91,6 @@ enum ib_sa_selector { IB_SA_BEST = 3 }; -enum ib_sa_rate { - IB_SA_RATE_2_5_GBPS = 2, - IB_SA_RATE_5_GBPS = 5, - IB_SA_RATE_10_GBPS = 3, - IB_SA_RATE_20_GBPS = 6, - IB_SA_RATE_30_GBPS = 4, - IB_SA_RATE_40_GBPS = 7, - IB_SA_RATE_60_GBPS = 8, - IB_SA_RATE_80_GBPS = 9, - IB_SA_RATE_120_GBPS = 10 -}; - -static inline int ib_sa_rate_enum_to_int(enum ib_sa_rate rate) -{ - switch (rate) { - case IB_SA_RATE_2_5_GBPS: return 1; - case IB_SA_RATE_5_GBPS: return 2; - case IB_SA_RATE_10_GBPS: return 4; - case IB_SA_RATE_20_GBPS: return 8; - case IB_SA_RATE_30_GBPS: return 12; - case IB_SA_RATE_40_GBPS: return 16; - case IB_SA_RATE_60_GBPS: return 24; - case IB_SA_RATE_80_GBPS: return 32; - case IB_SA_RATE_120_GBPS: return 48; - default: return -1; - } -} - /* * Structures for SA records are named "struct ib_sa_xxx_rec." No * attempt is made to pack structures to match the physical layout of diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c1ad6273ac6ca..6bbf1b3644005 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -314,6 +314,34 @@ enum ib_ah_flags { IB_AH_GRH = 1 }; +enum ib_rate { + IB_RATE_PORT_CURRENT = 0, + IB_RATE_2_5_GBPS = 2, + IB_RATE_5_GBPS = 5, + IB_RATE_10_GBPS = 3, + IB_RATE_20_GBPS = 6, + IB_RATE_30_GBPS = 4, + IB_RATE_40_GBPS = 7, + IB_RATE_60_GBPS = 8, + IB_RATE_80_GBPS = 9, + IB_RATE_120_GBPS = 10 +}; + +/** + * ib_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; + +/** + * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate + * enum. + * @mult: multiple to convert. + */ +enum ib_rate mult_to_ib_rate(int mult) __attribute_const__; + struct ib_ah_attr { struct ib_global_route grh; u16 dlid; -- cgit 1.2.3-korg From f2de3b06126ddb07d0e4617225d74dce0855add3 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Apr 2006 14:59:40 +0300 Subject: IPoIB: Wait for join to finish before freeing mcast struct ipoib_mcast_restart_task() might free an mcast object while a join request is still outstanding, leading to an oops when the query completes. Fix this by waiting for query to complete, similar to what ipoib_stop_thread() is doing. The wait for mcast completion code is consolidated in wait_for_mcast_join(). Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 41 +++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 07b9826b51931..1dae4b238252d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -609,6 +609,22 @@ int ipoib_mcast_start_thread(struct net_device *dev) return 0; } +static void wait_for_mcast_join(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast) +{ + spin_lock_irq(&priv->lock); + if (mcast && mcast->query) { + ib_sa_cancel_query(mcast->query_id, mcast->query); + mcast->query = NULL; + spin_unlock_irq(&priv->lock); + ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + wait_for_completion(&mcast->done); + } + else + spin_unlock_irq(&priv->lock); +} + int ipoib_mcast_stop_thread(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -628,28 +644,10 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) if (flush) flush_workqueue(ipoib_workqueue); - spin_lock_irq(&priv->lock); - if (priv->broadcast && priv->broadcast->query) { - ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query); - priv->broadcast->query = NULL; - spin_unlock_irq(&priv->lock); - ipoib_dbg_mcast(priv, "waiting for bcast\n"); - wait_for_completion(&priv->broadcast->done); - } else - spin_unlock_irq(&priv->lock); + wait_for_mcast_join(priv, priv->broadcast); - list_for_each_entry(mcast, &priv->multicast_list, list) { - spin_lock_irq(&priv->lock); - if (mcast->query) { - ib_sa_cancel_query(mcast->query_id, mcast->query); - mcast->query = NULL; - spin_unlock_irq(&priv->lock); - ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n", - IPOIB_GID_ARG(mcast->mcmember.mgid)); - wait_for_completion(&mcast->done); - } else - spin_unlock_irq(&priv->lock); - } + list_for_each_entry(mcast, &priv->multicast_list, list) + wait_for_mcast_join(priv, mcast); return 0; } @@ -902,6 +900,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + wait_for_mcast_join(priv, mcast); ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); } -- cgit 1.2.3-korg From 0f4852513fb07405ce88da40d8c497060561246e Mon Sep 17 00:00:00 2001 From: Shirley Ma Date: Mon, 10 Apr 2006 09:43:58 -0700 Subject: IPoIB: Make send and receive queue sizes tunable Make IPoIB's send and receive queue sizes tunable via module parameters ("send_queue_size" and "recv_queue_size"). This allows the queue sizes to be enlarged to fix disastrously bad performance on some platforms and workloads, without bloating memory usage when large queues aren't needed. Signed-off-by: Shirley Ma Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 4 ++++ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 22 +++++++++++----------- drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 ++++++++++++++++++++++------ drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 6 +++--- 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 374109df73035..12a1e0572ef20 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -65,6 +65,8 @@ enum { IPOIB_RX_RING_SIZE = 128, IPOIB_TX_RING_SIZE = 64, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_NUM_WC = 4, @@ -332,6 +334,8 @@ static inline void ipoib_unregister_debugfs(void) { } #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG extern int ipoib_debug_level; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index ed65202878d8f..a54da42849ae1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -161,7 +161,7 @@ static int ipoib_ib_post_receives(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); int i; - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) { + for (i = 0; i < ipoib_recvq_size; ++i) { if (ipoib_alloc_rx_skb(dev, i)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); return -ENOMEM; @@ -187,7 +187,7 @@ static void ipoib_ib_handle_wc(struct net_device *dev, if (wr_id & IPOIB_OP_RECV) { wr_id &= ~IPOIB_OP_RECV; - if (wr_id < IPOIB_RX_RING_SIZE) { + if (wr_id < ipoib_recvq_size) { struct sk_buff *skb = priv->rx_ring[wr_id].skb; dma_addr_t addr = priv->rx_ring[wr_id].mapping; @@ -252,9 +252,9 @@ static void ipoib_ib_handle_wc(struct net_device *dev, struct ipoib_tx_buf *tx_req; unsigned long flags; - if (wr_id >= IPOIB_TX_RING_SIZE) { + if (wr_id >= ipoib_sendq_size) { ipoib_warn(priv, "completion event with wrid %d (> %d)\n", - wr_id, IPOIB_TX_RING_SIZE); + wr_id, ipoib_sendq_size); return; } @@ -275,7 +275,7 @@ static void ipoib_ib_handle_wc(struct net_device *dev, spin_lock_irqsave(&priv->tx_lock, flags); ++priv->tx_tail; if (netif_queue_stopped(dev) && - priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2) + priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) netif_wake_queue(dev); spin_unlock_irqrestore(&priv->tx_lock, flags); @@ -344,13 +344,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). */ - tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)]; + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; tx_req->skb = skb; addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len, DMA_TO_DEVICE); pci_unmap_addr_set(tx_req, mapping, addr); - if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1), + if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn, addr, skb->len))) { ipoib_warn(priv, "post_send failed\n"); ++priv->stats.tx_errors; @@ -363,7 +363,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, address->last_send = priv->tx_head; ++priv->tx_head; - if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) { + if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); netif_stop_queue(dev); } @@ -488,7 +488,7 @@ static int recvs_pending(struct net_device *dev) int pending = 0; int i; - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) + for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].skb) ++pending; @@ -527,7 +527,7 @@ int ipoib_ib_dev_stop(struct net_device *dev) */ while ((int) priv->tx_tail - (int) priv->tx_head < 0) { tx_req = &priv->tx_ring[priv->tx_tail & - (IPOIB_TX_RING_SIZE - 1)]; + (ipoib_sendq_size - 1)]; dma_unmap_single(priv->ca->dma_device, pci_unmap_addr(tx_req, mapping), tx_req->skb->len, @@ -536,7 +536,7 @@ int ipoib_ib_dev_stop(struct net_device *dev) ++priv->tx_tail; } - for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) + for (i = 0; i < ipoib_recvq_size; ++i) if (priv->rx_ring[i].skb) { dma_unmap_single(priv->ca->dma_device, pci_unmap_addr(&priv->rx_ring[i], diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9cb9e430aaafa..5bf7e263454bb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include /* For ARPHRD_xxx */ @@ -53,6 +54,14 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -795,20 +804,19 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) struct ipoib_dev_priv *priv = netdev_priv(dev); /* Allocate RX/TX "rings" to hold queued skbs */ - - priv->rx_ring = kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf), + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", - ca->name, IPOIB_RX_RING_SIZE); + ca->name, ipoib_recvq_size); goto out; } - priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf), + priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", - ca->name, IPOIB_TX_RING_SIZE); + ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } @@ -876,7 +884,7 @@ static void ipoib_setup(struct net_device *dev) dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; - dev->tx_queue_len = IPOIB_TX_RING_SIZE * 2; + dev->tx_queue_len = ipoib_sendq_size * 2; dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; /* MTU will be reset when mcast join happens */ @@ -1128,6 +1136,14 @@ static int __init ipoib_init_module(void) { int ret; + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE); + ret = ipoib_register_debugfs(); if (ret) return ret; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 5f0388027b257..1d49d1643c594 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -159,8 +159,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr init_attr = { .cap = { - .max_send_wr = IPOIB_TX_RING_SIZE, - .max_recv_wr = IPOIB_RX_RING_SIZE, + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, .max_send_sge = 1, .max_recv_sge = 1 }, @@ -175,7 +175,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) } priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, - IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1); + ipoib_sendq_size + ipoib_recvq_size + 1); if (IS_ERR(priv->cq)) { printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); goto out_free_pd; -- cgit 1.2.3-korg From abf45dbb5b256dab439ca3b6b71191ecfddf9cb6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Apr 2006 15:47:16 +0300 Subject: IB/mthca: Disable tuning PCI read burst size The PCI spec recommends against drivers playing with a device's PCI read burst size, and says that systems software should configure it. And we actually have users that report that changing it from the default set by BIOS hurts performance and/or stability for them. On the other hand, the Mellanox Programmer's Reference Manual recommends turning it up all the way to the maximum value. Some tests conducted here in the lab do not show performance improvement from this tuning, but this might be just me. As a work-around, make this tuning an option, off by default (safe value), with an eye towards removing it completely one day if no one complains. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 0dc5b8da0007d..7e9c97b094041 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -77,6 +77,10 @@ MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero"); #endif /* CONFIG_PCI_MSI */ +static int tune_pci = 0; +module_param(tune_pci, int, 0444); +MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); + static const char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -98,6 +102,9 @@ static int __devinit mthca_tune_pci(struct mthca_dev *mdev) int cap; u16 val; + if (!tune_pci) + return 0; + /* First try to max out Read Byte Count */ cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); if (cap) { -- cgit 1.2.3-korg From a30bb96c6f5aca6513e4dbd94962da03d14b20a9 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 5 Apr 2006 15:59:34 +0300 Subject: IPoIB: Close race in ipoib_flush_paths() ib_sa_cancel_query() must be called with priv->lock held since a completion might arrive and set path->query to NULL. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 5bf7e263454bb..996c6e16a46d4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -346,14 +346,15 @@ void ipoib_flush_paths(struct net_device *dev) list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); - spin_unlock_irqrestore(&priv->lock, flags); - list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); path_free(dev, path); + spin_lock_irqsave(&priv->lock, flags); } + spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, -- cgit 1.2.3-korg From f697f74a6b189702474b2fd457e1f9365fa213e3 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 10 Apr 2006 09:43:59 -0700 Subject: IPoIB: Use spin_lock_irq() instead of spin_lock_irqsave() We know ipoib_flush_paths() is called from plain process context with interrupts enabled, since it does wait_for_completion(). So there's no need to use spin_lock_irqsave() -- spin_lock_irq() is fine. Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 996c6e16a46d4..cb078a7d0bf5b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -336,9 +336,8 @@ void ipoib_flush_paths(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); - unsigned long flags; - spin_lock_irqsave(&priv->lock, flags); + spin_lock_irq(&priv->lock); list_splice(&priv->path_list, &remove_list); INIT_LIST_HEAD(&priv->path_list); @@ -349,12 +348,12 @@ void ipoib_flush_paths(struct net_device *dev) list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); - spin_unlock_irqrestore(&priv->lock, flags); + spin_unlock_irq(&priv->lock); wait_for_completion(&path->done); path_free(dev, path); - spin_lock_irqsave(&priv->lock, flags); + spin_lock_irq(&priv->lock); } - spin_unlock_irqrestore(&priv->lock, flags); + spin_unlock_irq(&priv->lock); } static void path_rec_completion(int status, -- cgit 1.2.3-korg From ce684df05a531904ea055d01aeee75321fa0db1e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 10 Apr 2006 13:17:43 -0700 Subject: IB/cache: Use correct pointer to calculate size When allocating gid_cache, use kmalloc(sizeof *gid_cache, ...) rather than kmalloc(sizeof *pkey_cache, ...). It doesn't really matter which one is used, since the size ends up the same either way, but it's much better to say what we mean. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/core/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index c57a3871184ca..50364c0b090c7 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -302,7 +302,7 @@ static void ib_cache_setup_one(struct ib_device *device) kmalloc(sizeof *device->cache.pkey_cache * (end_port(device) - start_port(device) + 1), GFP_KERNEL); device->cache.gid_cache = - kmalloc(sizeof *device->cache.pkey_cache * + kmalloc(sizeof *device->cache.gid_cache * (end_port(device) - start_port(device) + 1), GFP_KERNEL); if (!device->cache.pkey_cache || !device->cache.gid_cache) { -- cgit 1.2.3-korg From 59fef3b1e96217c6e736372ff8cc95cbcca1b6aa Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 11 Apr 2006 18:16:27 +0300 Subject: IB/mthca: Fix max_srq_sge returned by ib_query_device for Tavor devices The driver allocates SRQ WQEs size with a power of 2 size both for Tavor and for memfree. For Tavor, however, the hardware only requires the WQE size to be a multiple of 16, not a power of 2, and the max number of scatter-gather allowed is reported accordingly by the firmware (and this is the value currently returned by ib_query_device() and ibv_query_device()). If the max number of scatter/gather entries reported by the FW is used when creating an SRQ, the creation will fail for Tavor, since the required WQE size will be increased to the next power of 2, which turns out to be larger than the device permitted max WQE size (which is not a power of 2). This patch reduces the reported SRQ max wqe size so that it can be used successfully in creating an SRQ on Tavor HCAs. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_dev.h | 2 ++ drivers/infiniband/hw/mthca/mthca_main.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/mthca/mthca_srq.c | 27 ++++++++++++++++++++++++++- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 49d0eae5518a2..4c1dcb4c18222 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -151,6 +151,7 @@ struct mthca_limits { int reserved_qps; int num_srqs; int max_srq_wqes; + int max_srq_sge; int reserved_srqs; int num_eecs; int reserved_eecs; @@ -507,6 +508,7 @@ void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask); int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mthca_max_srq_sge(struct mthca_dev *dev); void mthca_srq_event(struct mthca_dev *dev, u32 srqn, enum ib_event_type event_type); void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 7e9c97b094041..9b9ff7bff357a 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -191,6 +191,7 @@ static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim mdev->limits.reserved_srqs = dev_lim->reserved_srqs; mdev->limits.reserved_eecs = dev_lim->reserved_eecs; mdev->limits.max_desc_sz = dev_lim->max_desc_sz; + mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 2c250bc11c332..565a24b1756f1 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -106,7 +106,7 @@ static int mthca_query_device(struct ib_device *ibdev, props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; props->max_srq_wr = mdev->limits.max_srq_wqes; - props->max_srq_sge = mdev->limits.max_sg; + props->max_srq_sge = mdev->limits.max_srq_sge; props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 2dd3aea053415..adcaf85355ae6 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -192,7 +192,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, /* Sanity check SRQ size before proceeding */ if (attr->max_wr > dev->limits.max_srq_wqes || - attr->max_sge > dev->limits.max_sg) + attr->max_sge > dev->limits.max_srq_sge) return -EINVAL; srq->max = attr->max_wr; @@ -660,6 +660,31 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, return err; } +int mthca_max_srq_sge(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return dev->limits.max_sg; + + /* + * SRQ allocations are based on powers of 2 for Tavor, + * (although they only need to be multiples of 16 bytes). + * + * Therefore, we need to base the max number of sg entries on + * the largest power of 2 descriptor size that is <= to the + * actual max WQE descriptor size, rather than return the + * max_sg value given by the firmware (which is based on WQE + * sizes as multiples of 16, not powers of 2). + * + * If SRQ implementation is changed for Tavor to be based on + * multiples of 16, the calculation below can be deleted and + * the FW max_sg value returned. + */ + return min_t(int, dev->limits.max_sg, + ((1 << (fls(dev->limits.max_desc_sz) - 1)) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + int __devinit mthca_init_srq_table(struct mthca_dev *dev) { int err; -- cgit 1.2.3-korg