3.0 patches

author: Greg Kroah-Hartman <gregkh@suse.de> 2011-08-19 09:58:51 -0700
committer: Greg Kroah-Hartman <gregkh@suse.de> 2011-08-19 09:58:51 -0700
commit: 947bd36fdaa2b980949a83579f04076b7e90dc6a (patch)
tree: 4c93486d66e5c0ff134f2536e2af87075d41e491
parent: 18651efa7d309f1cd2c1ed1a305a654fc645b5a3 (diff)
download: stable-queue-947bd36fdaa2b980949a83579f04076b7e90dc6a.tar.gz
7 files changed, 563 insertions, 0 deletions
diff --git a/queue-3.0/nfsv4.1-fix-the-callback-highest_used_slotid-behaviour.patch b/queue-3.0/nfsv4.1-fix-the-callback-highest_used_slotid-behaviour.patch
new file mode 100644
index 0000000000..3c53bf3500
--- /dev/null
+++ b/queue-3.0/nfsv4.1-fix-the-callback-highest_used_slotid-behaviour.patch
@@ -0,0 +1,179 @@
+From 55a673990ec04cf63005318bcf08c2b0046e5778 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Tue, 2 Aug 2011 14:46:29 -0400
+Subject: NFSv4.1: Fix the callback 'highest_used_slotid' behaviour
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 55a673990ec04cf63005318bcf08c2b0046e5778 upstream.
+
+Currently, there is no guarantee that we will call nfs4_cb_take_slot() even
+though nfs4_callback_compound() will consistently call
+nfs4_cb_free_slot() provided the cb_process_state has set the 'clp' field.
+The result is that we can trigger the BUG_ON() upon the next call to
+nfs4_cb_take_slot().
+
+This patch fixes the above problem by using the slot id that was taken in
+the CB_SEQUENCE operation as a flag for whether or not we need to call
+nfs4_cb_free_slot().
+It also fixes an atomicity problem: we need to set tbl->highest_used_slotid
+atomically with the check for NFS4_SESSION_DRAINING, otherwise we end up
+racing with the various tests in nfs4_begin_drain_session().
+
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/callback.h      |    2 +-
+ fs/nfs/callback_proc.c |   20 ++++++++++++++------
+ fs/nfs/callback_xdr.c  |   24 +++++++-----------------
+ 3 files changed, 22 insertions(+), 24 deletions(-)
+
+--- a/fs/nfs/callback.h
++++ b/fs/nfs/callback.h
+@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
+ struct cb_process_state {
+ 	__be32			drc_status;
+ 	struct nfs_client	*clp;
++	int			slotid;
+ };
+ 
+ struct cb_compound_hdr_arg {
+@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutreca
+ 	void *dummy, struct cb_process_state *cps);
+ 
+ extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
+-extern void nfs4_cb_take_slot(struct nfs_client *clp);
+ 
+ struct cb_devicenotifyitem {
+ 	uint32_t		cbd_notify_type;
+--- a/fs/nfs/callback_proc.c
++++ b/fs/nfs/callback_proc.c
+@@ -333,7 +333,7 @@ validate_seqid(struct nfs4_slot_table *t
+ 	/* Normal */
+ 	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
+ 		slot->seq_nr++;
+-		return htonl(NFS4_OK);
++		goto out_ok;
+ 	}
+ 
+ 	/* Replay */
+@@ -352,11 +352,14 @@ validate_seqid(struct nfs4_slot_table *t
+ 	/* Wraparound */
+ 	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
+ 		slot->seq_nr = 1;
+-		return htonl(NFS4_OK);
++		goto out_ok;
+ 	}
+ 
+ 	/* Misordered request */
+ 	return htonl(NFS4ERR_SEQ_MISORDERED);
++out_ok:
++	tbl->highest_used_slotid = args->csa_slotid;
++	return htonl(NFS4_OK);
+ }
+ 
+ /*
+@@ -418,26 +421,32 @@ __be32 nfs4_callback_sequence(struct cb_
+ 			      struct cb_sequenceres *res,
+ 			      struct cb_process_state *cps)
+ {
++	struct nfs4_slot_table *tbl;
+ 	struct nfs_client *clp;
+ 	int i;
+ 	__be32 status = htonl(NFS4ERR_BADSESSION);
+ 
+-	cps->clp = NULL;
+-
+ 	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
+ 	if (clp == NULL)
+ 		goto out;
+ 
++	tbl = &clp->cl_session->bc_slot_table;
++
++	spin_lock(&tbl->slot_tbl_lock);
+ 	/* state manager is resetting the session */
+ 	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+-		status = NFS4ERR_DELAY;
++		spin_unlock(&tbl->slot_tbl_lock);
++		status = htonl(NFS4ERR_DELAY);
+ 		goto out;
+ 	}
+ 
+ 	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
++	spin_unlock(&tbl->slot_tbl_lock);
+ 	if (status)
+ 		goto out;
+ 
++	cps->slotid = args->csa_slotid;
++
+ 	/*
+ 	 * Check for pending referring calls.  If a match is found, a
+ 	 * related callback was received before the response to the original
+@@ -454,7 +463,6 @@ __be32 nfs4_callback_sequence(struct cb_
+ 	res->csr_slotid = args->csa_slotid;
+ 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+-	nfs4_cb_take_slot(clp);
+ 
+ out:
+ 	cps->clp = clp; /* put in nfs4_callback_compound */
+--- a/fs/nfs/callback_xdr.c
++++ b/fs/nfs/callback_xdr.c
+@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(stru
+ 	 * Let the state manager know callback processing done.
+ 	 * A single slot, so highest used slotid is either 0 or -1
+ 	 */
+-	tbl->highest_used_slotid--;
++	tbl->highest_used_slotid = -1;
+ 	nfs4_check_drain_bc_complete(session);
+ 	spin_unlock(&tbl->slot_tbl_lock);
+ }
+ 
+-static void nfs4_cb_free_slot(struct nfs_client *clp)
++static void nfs4_cb_free_slot(struct cb_process_state *cps)
+ {
+-	if (clp && clp->cl_session)
+-		nfs4_callback_free_slot(clp->cl_session);
+-}
+-
+-/* A single slot, so highest used slotid is either 0 or -1 */
+-void nfs4_cb_take_slot(struct nfs_client *clp)
+-{
+-	struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
+-
+-	spin_lock(&tbl->slot_tbl_lock);
+-	tbl->highest_used_slotid++;
+-	BUG_ON(tbl->highest_used_slotid != 0);
+-	spin_unlock(&tbl->slot_tbl_lock);
++	if (cps->slotid != -1)
++		nfs4_callback_free_slot(cps->clp->cl_session);
+ }
+ 
+ #else /* CONFIG_NFS_V4_1 */
+@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned in
+ 	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
+ 
+-static void nfs4_cb_free_slot(struct nfs_client *clp)
++static void nfs4_cb_free_slot(struct cb_process_state *cps)
+ {
+ }
+ #endif /* CONFIG_NFS_V4_1 */
+@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(str
+ 	struct cb_process_state cps = {
+ 		.drc_status = 0,
+ 		.clp = NULL,
++		.slotid = -1,
+ 	};
+ 	unsigned int nops = 0;
+ 
+@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(str
+ 
+ 	*hdr_res.status = status;
+ 	*hdr_res.nops = htonl(nops);
+-	nfs4_cb_free_slot(cps.clp);
++	nfs4_cb_free_slot(&cps);
+ 	nfs_put_client(cps.clp);
+ 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
+ 	return rpc_success;
diff --git a/queue-3.0/nfsv4.1-return-nfs4err_badsession-to-callbacks-during.patch b/queue-3.0/nfsv4.1-return-nfs4err_badsession-to-callbacks-during.patch
new file mode 100644
index 0000000000..48806c3dfd
--- /dev/null
+++ b/queue-3.0/nfsv4.1-return-nfs4err_badsession-to-callbacks-during.patch
@@ -0,0 +1,44 @@
+From 910ac68a2b80c7de95bc8488734067b1bb15d583 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Tue, 2 Aug 2011 14:46:52 -0400
+Subject: NFSv4.1: Return NFS4ERR_BADSESSION to callbacks during
+ session resets
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 910ac68a2b80c7de95bc8488734067b1bb15d583 upstream.
+
+If the client is in the process of resetting the session when it receives
+a callback, then returning NFS4ERR_DELAY may cause a deadlock with the
+DESTROY_SESSION call.
+
+Basically, if the client returns NFS4ERR_DELAY in response to the
+CB_SEQUENCE call, then the server is entitled to believe that the
+client is busy because it is already processing that call. In that
+case, the server is perfectly entitled to respond with a
+NFS4ERR_BACK_CHAN_BUSY to any DESTROY_SESSION call.
+
+Fix this by having the client reply with a NFS4ERR_BADSESSION in
+response to the callback if it is resetting the session.
+
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/callback_proc.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/nfs/callback_proc.c
++++ b/fs/nfs/callback_proc.c
+@@ -437,6 +437,11 @@ __be32 nfs4_callback_sequence(struct cb_
+ 	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+ 		spin_unlock(&tbl->slot_tbl_lock);
+ 		status = htonl(NFS4ERR_DELAY);
++		/* Return NFS4ERR_BADSESSION if we're draining the session
++		 * in order to reset it.
++		 */
++		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
++			status = htonl(NFS4ERR_BADSESSION);
+ 		goto out;
+ 	}
+ 
diff --git a/queue-3.0/pata_via-disable-atapi-dma-on-averatec-3200.patch b/queue-3.0/pata_via-disable-atapi-dma-on-averatec-3200.patch
new file mode 100644
index 0000000000..065f37e90f
--- /dev/null
+++ b/queue-3.0/pata_via-disable-atapi-dma-on-averatec-3200.patch
@@ -0,0 +1,59 @@
+From 6d0e194d2eefcaab6dbdca1f639748660144acb5 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Thu, 4 Aug 2011 11:15:07 +0200
+Subject: pata_via: disable ATAPI DMA on AVERATEC 3200
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 6d0e194d2eefcaab6dbdca1f639748660144acb5 upstream.
+
+On AVERATEC 3200, pata_via causes memory corruption with ATAPI DMA,
+which often leads to random kernel oops.  The cause of the problem is
+not well understood yet and only a small subset of machines using the
+controller seem affected.  Blacklist ATAPI DMA on the machine.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=11426
+Reported-and-tested-by: Jim Bray <jimsantelmo@gmail.com>
+Cc: Alan Cox <alan@linux.intel.com>
+Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/ata/pata_via.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/ata/pata_via.c
++++ b/drivers/ata/pata_via.c
+@@ -124,6 +124,17 @@ static const struct via_isa_bridge {
+ 	{ NULL }
+ };
+ 
++static const struct dmi_system_id no_atapi_dma_dmi_table[] = {
++	{
++		.ident = "AVERATEC 3200",
++		.matches = {
++			DMI_MATCH(DMI_BOARD_VENDOR, "AVERATEC"),
++			DMI_MATCH(DMI_BOARD_NAME, "3200"),
++		},
++	},
++	{ }
++};
++
+ struct via_port {
+ 	u8 cached_device;
+ };
+@@ -355,6 +366,13 @@ static unsigned long via_mode_filter(str
+ 			mask &= ~ ATA_MASK_UDMA;
+ 		}
+ 	}
++
++	if (dev->class == ATA_DEV_ATAPI &&
++	    dmi_check_system(no_atapi_dma_dmi_table)) {
++		ata_dev_warn(dev, "controller locks up on ATAPI DMA, forcing PIO\n");
++		mask &= ATA_MASK_PIO;
++	}
++
+ 	return mask;
+ }
+ 
diff --git a/queue-3.0/pnfs-obj-bug-when-we-are-running-out-of-bio.patch b/queue-3.0/pnfs-obj-bug-when-we-are-running-out-of-bio.patch
new file mode 100644
index 0000000000..a9db245bdf
--- /dev/null
+++ b/queue-3.0/pnfs-obj-bug-when-we-are-running-out-of-bio.patch
@@ -0,0 +1,69 @@
+From 20618b21da0796115e81906d24ff1601552701b7 Mon Sep 17 00:00:00 2001
+From: Boaz Harrosh <bharrosh@panasas.com>
+Date: Wed, 3 Aug 2011 21:54:33 -0700
+Subject: pnfs-obj: Bug when we are running out of bio
+
+From: Boaz Harrosh <bharrosh@panasas.com>
+
+commit 20618b21da0796115e81906d24ff1601552701b7 upstream.
+
+When we have a situation where the number of pages we want
+to encode is bigger than the size of the bio (which can
+currently happen only when all IO is going to a single device,
+e.g. group_width==1), then the IO is submitted short and we
+report back only the number of bytes we actually wrote/read
+and all is fine. BUT ...
+
+There was a bug where the current length counter was advanced
+before the failure to add the extra page, so we ended up in a situation
+where the CDB length was one page longer than the actual bio size,
+which is of course rejected by the osd-target.
+
+While here, also fix the bio size calculation in the case
+that we received more than one group of devices.
+
+Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/objlayout/objio_osd.c |   12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+--- a/fs/nfs/objlayout/objio_osd.c
++++ b/fs/nfs/objlayout/objio_osd.c
+@@ -587,22 +587,19 @@ static void _calc_stripe_info(struct obj
+ }
+ 
+ static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
+-		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
++		unsigned pgbase, struct _objio_per_comp *per_dev, int len,
+ 		gfp_t gfp_flags)
+ {
+ 	unsigned pg = *cur_pg;
++	int cur_len = len;
+ 	struct request_queue *q =
+ 			osd_request_queue(_io_od(ios, per_dev->dev));
+ 
+-	per_dev->length += cur_len;
+-
+ 	if (per_dev->bio == NULL) {
+-		unsigned stripes = ios->layout->num_comps /
+-						     ios->layout->mirrors_p1;
+-		unsigned pages_in_stripe = stripes *
++		unsigned pages_in_stripe = ios->layout->group_width *
+ 				      (ios->layout->stripe_unit / PAGE_SIZE);
+ 		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+-				    stripes;
++				    ios->layout->group_width;
+ 
+ 		if (BIO_MAX_PAGES_KMALLOC < bio_size)
+ 			bio_size = BIO_MAX_PAGES_KMALLOC;
+@@ -630,6 +627,7 @@ static int _add_stripe_unit(struct objio
+ 	}
+ 	BUG_ON(cur_len);
+ 
++	per_dev->length += len;
+ 	*cur_pg = pg;
+ 	return 0;
+ }
diff --git a/queue-3.0/pnfs-obj-fix-the-comp_index-0-case.patch b/queue-3.0/pnfs-obj-fix-the-comp_index-0-case.patch
new file mode 100644
index 0000000000..6d10741b5a
--- /dev/null
+++ b/queue-3.0/pnfs-obj-fix-the-comp_index-0-case.patch
@@ -0,0 +1,97 @@
+From 9af7db3228acc286c50e3a0f054ec982efdbc6c6 Mon Sep 17 00:00:00 2001
+From: Boaz Harrosh <bharrosh@panasas.com>
+Date: Wed, 3 Aug 2011 21:52:51 -0700
+Subject: pnfs-obj: Fix the comp_index != 0 case
+
+From: Boaz Harrosh <bharrosh@panasas.com>
+
+commit 9af7db3228acc286c50e3a0f054ec982efdbc6c6 upstream.
+
+There were bugs in the case of partial layout where olo_comps_index
+is not zero. This used to work and was tested but one of the later
+cleanup SQUASHMEs broke it and was not tested since.
+
+Also add a dprintk that reports those received layout parameters.
+Everything else was already printed.
+
+Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/objlayout/objio_osd.c        |   16 +++++++---------
+ fs/nfs/objlayout/pnfs_osd_xdr_cli.c |    3 +++
+ 2 files changed, 10 insertions(+), 9 deletions(-)
+
+--- a/fs/nfs/objlayout/objio_osd.c
++++ b/fs/nfs/objlayout/objio_osd.c
+@@ -479,7 +479,6 @@ static int _io_check(struct objio_state
+ 	for (i = 0; i <  ios->numdevs; i++) {
+ 		struct osd_sense_info osi;
+ 		struct osd_request *or = ios->per_dev[i].or;
+-		unsigned dev;
+ 		int ret;
+ 
+ 		if (!or)
+@@ -500,9 +499,8 @@ static int _io_check(struct objio_state
+ 
+ 			continue; /* we recovered */
+ 		}
+-		dev = ios->per_dev[i].dev;
+-		objlayout_io_set_result(&ios->ol_state, dev,
+-					&ios->layout->comps[dev].oc_object_id,
++		objlayout_io_set_result(&ios->ol_state, i,
++					&ios->layout->comps[i].oc_object_id,
+ 					osd_pri_2_pnfs_err(osi.osd_err_pri),
+ 					ios->per_dev[i].offset,
+ 					ios->per_dev[i].length,
+@@ -650,7 +648,7 @@ static int _prepare_one_group(struct obj
+ 	int ret = 0;
+ 
+ 	while (length) {
+-		struct _objio_per_comp *per_dev = &ios->per_dev[dev];
++		struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
+ 		unsigned cur_len, page_off = 0;
+ 
+ 		if (!per_dev->length) {
+@@ -670,8 +668,8 @@ static int _prepare_one_group(struct obj
+ 				cur_len = stripe_unit;
+ 			}
+ 
+-			if (max_comp < dev)
+-				max_comp = dev;
++			if (max_comp < dev - first_dev)
++				max_comp = dev - first_dev;
+ 		} else {
+ 			cur_len = stripe_unit;
+ 		}
+@@ -806,7 +804,7 @@ static int _read_mirrors(struct objio_st
+ 	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ 	unsigned dev = per_dev->dev;
+ 	struct pnfs_osd_object_cred *cred =
+-			&ios->layout->comps[dev];
++			&ios->layout->comps[cur_comp];
+ 	struct osd_obj_id obj = {
+ 		.partition = cred->oc_object_id.oid_partition_id,
+ 		.id = cred->oc_object_id.oid_object_id,
+@@ -904,7 +902,7 @@ static int _write_mirrors(struct objio_s
+ 	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+ 		struct osd_request *or = NULL;
+ 		struct pnfs_osd_object_cred *cred =
+-					&ios->layout->comps[dev];
++					&ios->layout->comps[cur_comp];
+ 		struct osd_obj_id obj = {
+ 			.partition = cred->oc_object_id.oid_partition_id,
+ 			.id = cred->oc_object_id.oid_object_id,
+--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
++++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struc
+ 	p = _osd_xdr_decode_data_map(p, &layout->olo_map);
+ 	layout->olo_comps_index = be32_to_cpup(p++);
+ 	layout->olo_num_comps = be32_to_cpup(p++);
++	dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
++		layout->olo_comps_index, layout->olo_num_comps);
++
+ 	iter->total_comps = layout->olo_num_comps;
+ 	return 0;
+ }
diff --git a/queue-3.0/series b/queue-3.0/series
index 7a03840847..07bb57d881 100644
--- a/queue-3.0/series
+++ b/queue-3.0/series
@@ -4,3 +4,9 @@ befs-validate-length-of-long-symbolic-links.patch
 i7core_edac-fixed-typo-in-error-count-calculation.patch
 possible-memory-corruption-on-mount.patch
 x86-intel-power-correct-the-msr_ia32_energy_perf_bias.patch
+pata_via-disable-atapi-dma-on-averatec-3200.patch
+pnfs-obj-fix-the-comp_index-0-case.patch
+pnfs-obj-bug-when-we-are-running-out-of-bio.patch
+nfsv4.1-fix-the-callback-highest_used_slotid-behaviour.patch
+nfsv4.1-return-nfs4err_badsession-to-callbacks-during.patch
+x86-mtrr-lock-stop-machine-during-mtrr-rendezvous-sequence.patch
diff --git a/queue-3.0/x86-mtrr-lock-stop-machine-during-mtrr-rendezvous-sequence.patch b/queue-3.0/x86-mtrr-lock-stop-machine-during-mtrr-rendezvous-sequence.patch
new file mode 100644
index 0000000000..02094b1768
--- /dev/null
+++ b/queue-3.0/x86-mtrr-lock-stop-machine-during-mtrr-rendezvous-sequence.patch
@@ -0,0 +1,109 @@
+From 6d3321e8e2b3bf6a5892e2ef673c7bf536e3f904 Mon Sep 17 00:00:00 2001
+From: Suresh Siddha <suresh.b.siddha@intel.com>
+Date: Thu, 23 Jun 2011 11:19:26 -0700
+Subject: x86, mtrr: lock stop machine during MTRR rendezvous sequence
+
+From: Suresh Siddha <suresh.b.siddha@intel.com>
+
+commit 6d3321e8e2b3bf6a5892e2ef673c7bf536e3f904 upstream.
+
+MTRR rendezvous sequence using stop_one_cpu_nowait() can potentially
+happen in parallel with another system wide rendezvous using
+stop_machine(). This can lead to deadlock (The order in which
+works are queued can be different on different cpu's. Some cpu's
+will be running the first rendezvous handler and others will be running
+the second rendezvous handler. Each set waiting for the other set to join
+for the system wide rendezvous, leading to a deadlock).
+
+MTRR rendezvous sequence is not implemented using stop_machine() as this
+gets called both from the process context as well as the cpu online paths
+(where the cpu has not come online and the interrupts are disabled etc).
+stop_machine() works with only online cpus.
+
+For now, take the stop_machine mutex in the MTRR rendezvous sequence that
+gets called from an online cpu (here we are in the process context
+and can potentially sleep while taking the mutex). And the MTRR rendezvous
+that gets triggered during cpu online doesn't need to take this stop_machine
+lock (as the stop_machine() already ensures that there is no cpu hotplug
+going on in parallel by doing get_online_cpus())
+
+    TBD: Pursue a cleaner solution of extending the stop_machine()
+         infrastructure to handle the case where the calling cpu is
+         still not online and use this for MTRR rendezvous sequence.
+
+fixes: https://bugzilla.novell.com/show_bug.cgi?id=672008
+
+Reported-by: Vadim Kotelnikov <vadimuzzz@inbox.ru>
+Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
+Link: http://lkml.kernel.org/r/20110623182056.807230326@sbsiddha-MOBL3.sc.intel.com
+Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/kernel/cpu/mtrr/main.c |   23 +++++++++++++++++++++++
+ include/linux/stop_machine.h    |    2 ++
+ kernel/stop_machine.c           |    2 +-
+ 3 files changed, 26 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/mtrr/main.c
++++ b/arch/x86/kernel/cpu/mtrr/main.c
+@@ -248,6 +248,25 @@ set_mtrr(unsigned int reg, unsigned long
+ 	unsigned long flags;
+ 	int cpu;
+ 
++#ifdef CONFIG_SMP
++	/*
++	 * If this cpu is not yet active, we are in the cpu online path. There
++	 * can be no stop_machine() in parallel, as stop machine ensures this
++	 * by using get_online_cpus(). We can skip taking the stop_cpus_mutex,
++	 * as we don't need it and also we can't afford to block while waiting
++	 * for the mutex.
++	 *
++	 * If this cpu is active, we need to prevent stop_machine() happening
++	 * in parallel by taking the stop cpus mutex.
++	 *
++	 * Also, this is called in the context of cpu online path or in the
++	 * context where cpu hotplug is prevented. So checking the active status
++	 * of the raw_smp_processor_id() is safe.
++	 */
++	if (cpu_active(raw_smp_processor_id()))
++		mutex_lock(&stop_cpus_mutex);
++#endif
++
+ 	preempt_disable();
+ 
+ 	data.smp_reg = reg;
+@@ -330,6 +349,10 @@ set_mtrr(unsigned int reg, unsigned long
+ 
+ 	local_irq_restore(flags);
+ 	preempt_enable();
++#ifdef CONFIG_SMP
++	if (cpu_active(raw_smp_processor_id()))
++		mutex_unlock(&stop_cpus_mutex);
++#endif
+ }
+ 
+ /**
+--- a/include/linux/stop_machine.h
++++ b/include/linux/stop_machine.h
+@@ -27,6 +27,8 @@ struct cpu_stop_work {
+ 	struct cpu_stop_done	*done;
+ };
+ 
++extern struct mutex stop_cpus_mutex;
++
+ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ 			 struct cpu_stop_work *work_buf);
+--- a/kernel/stop_machine.c
++++ b/kernel/stop_machine.c
+@@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cp
+ 	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+ }
+ 
++DEFINE_MUTEX(stop_cpus_mutex);
+ /* static data for stop_cpus */
+-static DEFINE_MUTEX(stop_cpus_mutex);
+ static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+ 
+ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
author	Greg Kroah-Hartman <gregkh@suse.de>	2011-08-19 09:58:51 -0700
committer	Greg Kroah-Hartman <gregkh@suse.de>	2011-08-19 09:58:51 -0700
commit	947bd36fdaa2b980949a83579f04076b7e90dc6a (patch)
tree	4c93486d66e5c0ff134f2536e2af87075d41e491
parent	18651efa7d309f1cd2c1ed1a305a654fc645b5a3 (diff)
download	stable-queue-947bd36fdaa2b980949a83579f04076b7e90dc6a.tar.gz